diff --git a/.circleci/config.yml b/.circleci/config.yml index e947f30d285cd..6b516b21722ac 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,43 +1,6 @@ version: 2 jobs: - - # -------------------------------------------------------------------------- - # 0. py27_compat - # -------------------------------------------------------------------------- - py27_compat: - docker: - - image: continuumio/miniconda:latest - # databases configuration - - image: circleci/postgres:9.6.5-alpine-ram - environment: - POSTGRES_USER: postgres - POSTGRES_DB: pandas_nosetest - - image: circleci/mysql:8-ram - environment: - MYSQL_USER: "root" - MYSQL_HOST: "localhost" - MYSQL_ALLOW_EMPTY_PASSWORD: "true" - MYSQL_DATABASE: "pandas_nosetest" - environment: - JOB: "2.7_COMPAT" - ENV_FILE: "ci/circle-27-compat.yaml" - LOCALE_OVERRIDE: "it_IT.UTF-8" - MINICONDA_DIR: /home/ubuntu/miniconda3 - steps: - - checkout - - run: - name: build - command: | - ./ci/install_circle.sh - ./ci/show_circle.sh - - run: - name: test - command: ./ci/run_circle.sh --skip-slow --skip-network - - # -------------------------------------------------------------------------- - # 1. py36_locale - # -------------------------------------------------------------------------- - py36_locale: + build: docker: - image: continuumio/miniconda:latest # databases configuration @@ -54,41 +17,7 @@ jobs: environment: JOB: "3.6_LOCALE" - ENV_FILE: "ci/circle-36-locale.yaml" - LOCALE_OVERRIDE: "zh_CN.UTF-8" - MINICONDA_DIR: /home/ubuntu/miniconda3 - steps: - - checkout - - run: - name: build - command: | - ./ci/install_circle.sh - ./ci/show_circle.sh - - run: - name: test - command: ./ci/run_circle.sh --skip-slow --skip-network - - # -------------------------------------------------------------------------- - # 2. py36_locale_slow - # -------------------------------------------------------------------------- - py36_locale_slow: - docker: - - image: continuumio/miniconda:latest - # databases configuration - - image: circleci/postgres:9.6.5-alpine-ram - environment: - POSTGRES_USER: postgres - POSTGRES_DB: pandas_nosetest - - image: circleci/mysql:8-ram - environment: - MYSQL_USER: "root" - MYSQL_HOST: "localhost" - MYSQL_ALLOW_EMPTY_PASSWORD: "true" - MYSQL_DATABASE: "pandas_nosetest" - - environment: - JOB: "3.6_LOCALE_SLOW" - ENV_FILE: "ci/circle-36-locale_slow.yaml" + ENV_FILE: "ci/deps/circle-36-locale.yaml" LOCALE_OVERRIDE: "zh_CN.UTF-8" MINICONDA_DIR: /home/ubuntu/miniconda3 steps: @@ -96,52 +25,14 @@ jobs: - run: name: build command: | - ./ci/install_circle.sh - ./ci/show_circle.sh + ./ci/circle/install_circle.sh + export PATH="$MINICONDA_DIR/bin:$PATH" + source activate pandas-dev + python -c "import pandas; pandas.show_versions();" - run: name: test - command: ./ci/run_circle.sh --only-slow --skip-network - - # -------------------------------------------------------------------------- - # 3. 
py35_ascii - # -------------------------------------------------------------------------- - py35_ascii: - docker: - - image: continuumio/miniconda:latest - # databases configuration - - image: circleci/postgres:9.6.5-alpine-ram - environment: - POSTGRES_USER: postgres - POSTGRES_DB: pandas_nosetest - - image: circleci/mysql:8-ram - environment: - MYSQL_USER: "root" - MYSQL_HOST: "localhost" - MYSQL_ALLOW_EMPTY_PASSWORD: "true" - MYSQL_DATABASE: "pandas_nosetest" - - environment: - JOB: "3.5_ASCII" - ENV_FILE: "ci/circle-35-ascii.yaml" - LOCALE_OVERRIDE: "C" - MINICONDA_DIR: /home/ubuntu/miniconda3 - steps: - - checkout - - run: - name: build command: | - ./ci/install_circle.sh - ./ci/show_circle.sh - - run: - name: test - command: ./ci/run_circle.sh --skip-slow --skip-network - - -workflows: - version: 2 - build_and_test: - jobs: - - py27_compat - - py36_locale - - py36_locale_slow - - py35_ascii + export PATH="$MINICONDA_DIR/bin:$PATH" + source activate pandas-dev + echo "pytest -m "not slow and not network" --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml pandas" + pytest -m "not slow and not network" --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml pandas diff --git a/.gitignore b/.gitignore index a59f2843c365a..f912fedb199c0 100644 --- a/.gitignore +++ b/.gitignore @@ -109,6 +109,5 @@ doc/build/html/index.html # Windows specific leftover: doc/tmp.sv doc/source/styled.xlsx -doc/source/templates/ env/ doc/source/savefig/ diff --git a/.pep8speaks.yml b/.pep8speaks.yml index cd610907007eb..cbcb098c47125 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -3,9 +3,17 @@ scanner: diff_only: True # If True, errors caused by only the patch are shown +# Opened issue in pep8speaks, so we can directly use the config in setup.cfg +# (and avoid having to duplicate it here): +# https://github.com/OrkoHunter/pep8speaks/issues/95 + pycodestyle: max-line-length: 79 - ignore: # Errors and warnings to ignore + ignore: + - W503, # line break before binary operator + - W504, # line break after binary operator - E402, # module level import not at top of file - E731, # do not assign a lambda expression, use a def - - W503 # line break before binary operator + - C406, # Unnecessary list literal - rewrite as a dict literal. + - C408, # Unnecessary dict call - rewrite as a literal. + - C409 # Unnecessary list passed to tuple() - rewrite as a tuple literal. 
diff --git a/.travis.yml b/.travis.yml index c9bdb91283d42..03026647d6bb8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ env: git: # for cloning - depth: 1000 + depth: 1500 matrix: fast_finish: true @@ -34,29 +34,28 @@ matrix: include: - dist: trusty env: - - JOB="3.7" ENV_FILE="ci/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="not slow and not network" - dist: trusty env: - - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true + - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/deps/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" PATTERN="slow" addons: apt: packages: - language-pack-zh-hans - dist: trusty env: - - JOB="2.7, lint" ENV_FILE="ci/travis-27.yaml" TEST_ARGS="--skip-slow" LINT=true + - JOB="2.7" ENV_FILE="ci/deps/travis-27.yaml" PATTERN="not slow" addons: apt: packages: - python-gtk2 - dist: trusty env: - - JOB="3.6, coverage" ENV_FILE="ci/travis-36.yaml" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true DOCTEST=true - + - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36.yaml" PATTERN="not slow and not network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true - dist: trusty env: - - JOB="3.7, NumPy dev" ENV_FILE="ci/travis-37-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network -W error" PANDAS_TESTING_MODE="deprecate" + - JOB="3.7, NumPy dev" ENV_FILE="ci/deps/travis-37-numpydev.yaml" PATTERN="not slow and not network" TEST_ARGS="-W error" PANDAS_TESTING_MODE="deprecate" addons: apt: packages: @@ -65,19 +64,19 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true + - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" # In allow_failures - dist: trusty env: - - JOB="3.6, doc" ENV_FILE="ci/travis-36-doc.yaml" DOC=true + - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true allow_failures: - dist: trusty env: - - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true + - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - dist: trusty env: - - JOB="3.6, doc" ENV_FILE="ci/travis-36-doc.yaml" DOC=true + - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true before_install: - echo "before_install" @@ -106,25 +105,17 @@ before_script: script: - echo "script start" + - source activate pandas-dev - ci/run_build_docs.sh - - ci/script_single.sh - - ci/script_multi.sh - - ci/lint.sh - - ci/doctests.sh - - echo "checking imports" - - source activate pandas && python ci/check_imports.py - - echo "script done" - -after_success: - - ci/upload_coverage.sh + - ci/run_tests.sh after_script: - echo "after_script start" - - source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - if [ -e /tmp/single.xml ]; then - ci/print_skipped.py /tmp/single.xml; + - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + - if [ -e test-data-single.xml ]; then + ci/print_skipped.py test-data-single.xml; fi - - if [ -e /tmp/multiple.xml ]; then - ci/print_skipped.py /tmp/multiple.xml; + - if [ -e test-data-multiple.xml ]; then + ci/print_skipped.py test-data-multiple.xml; fi - echo "after_script done" diff --git a/LICENSES/MUSL_LICENSE b/LICENSES/MUSL_LICENSE new file mode 100644 index 0000000000000..a8833d4bc4744 --- /dev/null +++ b/LICENSES/MUSL_LICENSE @@ -0,0 +1,132 @@ +musl as a whole is licensed under the following 
standard MIT license: + +---------------------------------------------------------------------- +Copyright © 2005-2014 Rich Felker, et al. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +---------------------------------------------------------------------- + +Authors/contributors include: + +Anthony G. Basile +Arvid Picciani +Bobby Bingham +Boris Brezillon +Brent Cook +Chris Spiegel +Clément Vasseur +Emil Renner Berthing +Hiltjo Posthuma +Isaac Dunham +Jens Gustedt +Jeremy Huntwork +John Spencer +Justin Cormack +Luca Barbato +Luka Perkov +M Farkas-Dyck (Strake) +Michael Forney +Nicholas J. Kain +orc +Pascal Cuoq +Pierre Carrier +Rich Felker +Richard Pennington +sin +Solar Designer +Stefan Kristiansson +Szabolcs Nagy +Timo Teräs +Valentin Ochs +William Haddon + +Portions of this software are derived from third-party works licensed +under terms compatible with the above MIT license: + +The TRE regular expression implementation (src/regex/reg* and +src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed +under a 2-clause BSD license (license text in the source files). The +included version has been heavily modified by Rich Felker in 2012, in +the interests of size, simplicity, and namespace cleanliness. + +Much of the math library code (src/math/* and src/complex/*) is +Copyright © 1993,2004 Sun Microsystems or +Copyright © 2003-2011 David Schultz or +Copyright © 2003-2009 Steven G. Kargl or +Copyright © 2003-2009 Bruce D. Evans or +Copyright © 2008 Stephen L. Moshier +and labelled as such in comments in the individual source files. All +have been licensed under extremely permissive terms. + +The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008 +The Android Open Source Project and is licensed under a two-clause BSD +license. It was taken from Bionic libc, used on Android. + +The implementation of DES for crypt (src/misc/crypt_des.c) is +Copyright © 1994 David Burren. It is licensed under a BSD license. + +The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was +originally written by Solar Designer and placed into the public +domain. The code also comes with a fallback permissive license for use +in jurisdictions that may not recognize the public domain. + +The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 +Valentin Ochs and is licensed under an MIT-style license. 
+ +The BSD PRNG implementation (src/prng/random.c) and XSI search API +(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and +licensed under following terms: "Permission to use, copy, modify, +and/or distribute this code for any purpose with or without fee is +hereby granted. There is no warranty." + +The x86_64 port was written by Nicholas J. Kain. Several files (crt) +were released into the public domain; others are licensed under the +standard MIT license terms at the top of this file. See individual +files for their copyright status. + +The mips and microblaze ports were originally written by Richard +Pennington for use in the ellcc project. The original code was adapted +by Rich Felker for build system and code conventions during upstream +integration. It is licensed under the standard MIT terms. + +The powerpc port was also originally written by Richard Pennington, +and later supplemented and integrated by John Spencer. It is licensed +under the standard MIT terms. + +All other files which have no copyright comments are original works +produced specifically for use as part of this library, written either +by Rich Felker, the main author of the library, or by one or more +contibutors listed above. Details on authorship of individual files +can be found in the git version control history of the project. The +omission of copyright and license comments in each file is in the +interest of source tree size. + +All public header files (include/* and arch/*/bits/*) should be +treated as Public Domain as they intentionally contain no content +which can be covered by copyright. Some source modules may fall in +this category as well. If you believe that a file is so trivial that +it should be in the Public Domain, please contact the authors and +request an explicit statement releasing it from copyright. 
+ +The following files are trivial, believed not to be copyrightable in +the first place, and hereby explicitly released to the Public Domain: + +All public headers: include/*, arch/*/bits/* +Startup files: crt/* diff --git a/Makefile b/Makefile index 4a4aca21e1b78..d2bd067950fd0 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ build: clean_pyc python setup.py build_ext --inplace lint-diff: - git diff master --name-only -- "*.py" | grep -E "pandas|scripts" | xargs flake8 + git diff upstream/master --name-only -- "*.py" | xargs flake8 develop: build -python setup.py develop diff --git a/README.md b/README.md index b4dedecb4c697..1993b1ecb9dc1 100644 --- a/README.md +++ b/README.md @@ -171,7 +171,7 @@ pip install pandas ``` ## Dependencies -- [NumPy](https://www.numpy.org): 1.9.0 or higher +- [NumPy](https://www.numpy.org): 1.12.0 or higher - [python-dateutil](https://labix.org/python-dateutil): 2.5.0 or higher - [pytz](https://pythonhosted.org/pytz): 2011k or higher diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index fc34440ece2ed..7dcd7b284d66d 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,10 +1,11 @@ -import warnings from importlib import import_module import numpy as np + import pandas as pd from pandas.util import testing as tm + for imp in ['pandas.util', 'pandas.tools.hashing']: try: hashing = import_module(imp) @@ -12,13 +13,9 @@ except (ImportError, TypeError, ValueError): pass -from .pandas_vb_common import setup # noqa - class Factorize(object): - goal_time = 0.2 - params = [True, False] param_names = ['sort'] @@ -40,8 +37,6 @@ def time_factorize_string(self, sort): class Duplicated(object): - goal_time = 0.2 - params = ['first', 'last', False] param_names = ['keep'] @@ -63,8 +58,6 @@ def time_duplicated_string(self, keep): class DuplicatedUniqueIndex(object): - goal_time = 0.2 - def setup(self): N = 10**5 self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) @@ -77,21 +70,13 @@ def time_duplicated_unique_int(self): class Match(object): - goal_time = 0.2 - def setup(self): self.uniques = tm.makeStringIndex(1000).values self.all = self.uniques.repeat(10) - def time_match_string(self): - with warnings.catch_warnings(record=True): - pd.match(self.all, self.uniques) - class Hashing(object): - goal_time = 0.2 - def setup_cache(self): N = 10**5 @@ -126,3 +111,6 @@ def time_series_timedeltas(self, df): def time_series_dates(self, df): hashing.hash_pandas_object(df['dates']) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index 48f0b7d71144c..d061755208c9e 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -5,13 +5,9 @@ except ImportError: from pandas.util.decorators import cache_readonly -from .pandas_vb_common import setup # noqa - class DataFrameAttributes(object): - goal_time = 0.2 - def setup(self): self.df = DataFrame(np.random.randn(10, 6)) self.cur_index = self.df.index @@ -25,8 +21,6 @@ def time_set_index(self): class CacheReadonly(object): - goal_time = 0.2 - def setup(self): class Foo: @@ -38,3 +32,6 @@ def prop(self): def time_cache_readonly(self): self.obj.prop + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index cc8766e1fa39c..22b8ed80f3d07 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -6,13 +6,9 @@ except 
ImportError: import pandas.computation.expressions as expr -from .pandas_vb_common import setup # noqa - class Ops(object): - goal_time = 0.2 - params = [[True, False], ['default', 1]] param_names = ['use_numexpr', 'threads'] @@ -44,8 +40,6 @@ def teardown(self, use_numexpr, threads): class Ops2(object): - goal_time = 0.2 - def setup(self): N = 10**3 self.df = DataFrame(np.random.randn(N, N)) @@ -58,6 +52,8 @@ def setup(self): np.iinfo(np.int16).max, size=(N, N))) + self.s = Series(np.random.randn(N)) + # Division def time_frame_float_div(self): @@ -80,10 +76,19 @@ def time_frame_int_mod(self): def time_frame_float_mod(self): self.df % self.df2 + # Dot product -class Timeseries(object): + def time_frame_dot(self): + self.df.dot(self.df2) + + def time_series_dot(self): + self.s.dot(self.s) + + def time_frame_series_dot(self): + self.df.dot(self.s) - goal_time = 0.2 + +class Timeseries(object): params = [None, 'US/Eastern'] param_names = ['tz'] @@ -111,8 +116,6 @@ def time_timestamp_ops_diff_with_shift(self, tz): class AddOverflowScalar(object): - goal_time = 0.2 - params = [1, -1, 0] param_names = ['scalar'] @@ -126,8 +129,6 @@ def time_add_overflow_scalar(self, scalar): class AddOverflowArray(object): - goal_time = 0.2 - def setup(self): N = 10**6 self.arr = np.arange(N) @@ -149,3 +150,6 @@ def time_add_overflow_b_mask_nan(self): def time_add_overflow_both_arg_nan(self): checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 2a7717378c280..7318b40efc8fb 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -11,13 +11,9 @@ except ImportError: pass -from .pandas_vb_common import setup # noqa - class Concat(object): - goal_time = 0.2 - def setup(self): N = 10**5 self.s = pd.Series(list('aabbcd') * N).astype('category') @@ -34,8 +30,6 @@ def time_union(self): class Constructor(object): - goal_time = 0.2 - def setup(self): N = 10**5 self.categories = list('abcde') @@ -52,6 +46,8 @@ def setup(self): self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) self.values_all_int8 = np.ones(N, 'int8') + self.categorical = pd.Categorical(self.values, self.categories) + self.series = pd.Series(self.categorical) def time_regular(self): pd.Categorical(self.values, self.categories) @@ -74,10 +70,14 @@ def time_all_nan(self): def time_from_codes_all_int8(self): pd.Categorical.from_codes(self.values_all_int8, self.categories) + def time_existing_categorical(self): + pd.Categorical(self.categorical) -class ValueCounts(object): + def time_existing_series(self): + pd.Categorical(self.series) - goal_time = 0.2 + +class ValueCounts(object): params = [True, False] param_names = ['dropna'] @@ -93,8 +93,6 @@ def time_value_counts(self, dropna): class Repr(object): - goal_time = 0.2 - def setup(self): self.sel = pd.Series(['s1234']).astype('category') @@ -104,8 +102,6 @@ def time_rendering(self): class SetCategories(object): - goal_time = 0.2 - def setup(self): n = 5 * 10**5 arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] @@ -117,8 +113,6 @@ def time_set_categories(self): class Rank(object): - goal_time = 0.2 - def setup(self): N = 10**5 ncats = 100 @@ -156,8 +150,6 @@ def time_rank_int_cat_ordered(self): class Isin(object): - goal_time = 0.2 - params = ['object', 'int64'] param_names = ['dtype'] @@ -197,8 
+189,6 @@ def time_categorical_series_is_monotonic_decreasing(self): class Contains(object): - goal_time = 0.2 - def setup(self): N = 10**5 self.ci = tm.makeCategoricalIndex(N) @@ -214,7 +204,6 @@ def time_categorical_contains(self): class CategoricalSlicing(object): - goal_time = 0.2 params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] param_names = ['index'] @@ -245,3 +234,6 @@ def time_getitem_list(self, index): def time_getitem_bool_array(self, index): self.data[self.data == self.cat_scalar] + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 3f9016787aab4..198ed1c90a2e9 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -2,13 +2,9 @@ import pandas.util.testing as tm from pandas import Series, Index, DatetimeIndex, Timestamp, MultiIndex -from .pandas_vb_common import setup # noqa - class SeriesConstructors(object): - goal_time = 0.2 - param_names = ["data_fmt", "with_index"] params = [[lambda x: x, list, @@ -32,8 +28,6 @@ def time_series_constructor(self, data_fmt, with_index): class SeriesDtypesConstructors(object): - goal_time = 0.2 - def setup(self): N = 10**4 self.arr = np.random.randn(N, N) @@ -56,11 +50,12 @@ def time_dtindex_from_index_with_series(self): class MultiIndexConstructor(object): - goal_time = 0.2 - def setup(self): N = 10**4 self.iterables = [tm.makeStringIndex(N), range(20)] def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 8e581dcf22b4c..837478efbad64 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -5,13 +5,9 @@ except ImportError: import pandas.computation.expressions as expr -from .pandas_vb_common import setup # noqa - class Eval(object): - goal_time = 0.2 - params = [['numexpr', 'python'], [1, 'all']] param_names = ['engine', 'threads'] @@ -43,8 +39,6 @@ def teardown(self, engine, threads): class Query(object): - goal_time = 0.2 - def setup(self): N = 10**6 halfway = (N // 2) - 1 @@ -65,3 +59,6 @@ def time_query_datetime_column(self): def time_query_with_boolean_selection(self): self.df.query('(a >= @self.min_val) & (a <= @self.max_val)') + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 9def910df0bab..60f6a66e07a7b 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -7,13 +7,9 @@ # For compatibility with older versions from pandas.core.datetools import * # noqa -from .pandas_vb_common import setup # noqa - class FromDicts(object): - goal_time = 0.2 - def setup(self): N, K = 5000, 50 self.index = tm.makeStringIndex(N) @@ -47,8 +43,6 @@ def time_nested_dict_int64(self): class FromSeries(object): - goal_time = 0.2 - def setup(self): mi = MultiIndex.from_product([range(100), range(100)]) self.s = Series(np.random.randn(10000), index=mi) @@ -59,7 +53,6 @@ def time_mi_series(self): class FromDictwithTimestamp(object): - goal_time = 0.2 params = [Nano(1), Hour(1)] param_names = ['offset'] @@ -76,7 +69,6 @@ def time_dict_with_timestamp_offsets(self, offset): class FromRecords(object): - goal_time = 0.2 params = [None, 1000] param_names = ['nrows'] @@ -91,11 +83,12 @@ def time_frame_from_records_generator(self, nrows): class FromNDArray(object): - goal_time = 0.2 - def setup(self): N = 100000 self.data = 
np.random.randn(N) def time_frame_from_ndarray(self): self.df = DataFrame(self.data) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index f911d506b1f4f..3c0dd646aa502 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,24 +1,19 @@ import string -import warnings import numpy as np -import pandas.util.testing as tm -from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, - isnull, NaT) -from .pandas_vb_common import setup # noqa +from pandas import ( + DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range) +import pandas.util.testing as tm class GetNumericData(object): - goal_time = 0.2 - def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) self.df['foo'] = 'bar' self.df['bar'] = 'baz' - with warnings.catch_warnings(record=True): - self.df = self.df.consolidate() + self.df = self.df._consolidate() def time_frame_get_numeric_data(self): self.df._get_numeric_data() @@ -26,8 +21,6 @@ def time_frame_get_numeric_data(self): class Lookup(object): - goal_time = 0.2 - def setup(self): self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) @@ -48,8 +41,6 @@ def time_frame_fancy_lookup_all(self): class Reindex(object): - goal_time = 0.2 - def setup(self): N = 10**3 self.df = DataFrame(np.random.randn(N * 10, N)) @@ -70,16 +61,41 @@ def time_reindex_axis1(self): def time_reindex_both_axes(self): self.df.reindex(index=self.idx, columns=self.idx) - def time_reindex_both_axes_ix(self): - self.df.ix[self.idx, self.idx] - def time_reindex_upcast(self): self.df2.reindex(np.random.permutation(range(1200))) -class Iteration(object): +class Rename(object): + + def setup(self): + N = 10**3 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.idx = np.arange(4 * N, 7 * N) + self.dict_idx = {k: k for k in self.idx} + self.df2 = DataFrame( + {c: {0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64)} + [np.random.randint(0, 4)] for c in range(N)}) + + def time_rename_single(self): + self.df.rename({0: 0}) + + def time_rename_axis0(self): + self.df.rename(self.dict_idx) - goal_time = 0.2 + def time_rename_axis1(self): + self.df.rename(columns=self.dict_idx) + + def time_rename_both_axes(self): + self.df.rename(index=self.dict_idx, columns=self.dict_idx) + + def time_dict_rename_both_axes(self): + self.df.rename(index=self.dict_idx, columns=self.dict_idx) + + +class Iteration(object): def setup(self): N = 1000 @@ -114,8 +130,6 @@ def time_iterrows(self): class ToString(object): - goal_time = 0.2 - def setup(self): self.df = DataFrame(np.random.randn(100, 10)) @@ -125,8 +139,6 @@ def time_to_string_floats(self): class ToHTML(object): - goal_time = 0.2 - def setup(self): nrows = 500 self.df2 = DataFrame(np.random.randn(nrows, 10)) @@ -139,8 +151,6 @@ def time_to_html_mixed(self): class Repr(object): - goal_time = 0.2 - def setup(self): nrows = 10000 data = np.random.randn(nrows, 10) @@ -166,8 +176,6 @@ def time_frame_repr_wide(self): class MaskBool(object): - goal_time = 0.2 - def setup(self): data = np.random.randn(1000, 500) df = DataFrame(data) @@ -184,8 +192,6 @@ def time_frame_mask_floats(self): class Isnull(object): - goal_time = 0.2 - def setup(self): N = 10**3 self.df_no_null = DataFrame(np.random.randn(N, N)) @@ -218,7 +224,6 @@ def time_isnull_obj(self): 
class Fillna(object): - goal_time = 0.2 params = ([True, False], ['pad', 'bfill']) param_names = ['inplace', 'method'] @@ -233,7 +238,6 @@ def time_frame_fillna(self, inplace, method): class Dropna(object): - goal_time = 0.2 params = (['all', 'any'], [0, 1]) param_names = ['how', 'axis'] @@ -254,8 +258,6 @@ def time_dropna_axis_mixed_dtypes(self, how, axis): class Count(object): - goal_time = 0.2 - params = [0, 1] param_names = ['axis'] @@ -284,8 +286,6 @@ def time_count_level_mixed_dtypes_multi(self, axis): class Apply(object): - goal_time = 0.2 - def setup(self): self.df = DataFrame(np.random.randn(1000, 100)) @@ -314,8 +314,6 @@ def time_apply_ref_by_name(self): class Dtypes(object): - goal_time = 0.2 - def setup(self): self.df = DataFrame(np.random.randn(1000, 1000)) @@ -325,8 +323,6 @@ def time_frame_dtypes(self): class Equals(object): - goal_time = 0.2 - def setup(self): N = 10**3 self.float_df = DataFrame(np.random.randn(N, N)) @@ -363,7 +359,6 @@ def time_frame_object_unequal(self): class Interpolate(object): - goal_time = 0.2 params = [None, 'infer'] param_names = ['downcast'] @@ -389,7 +384,6 @@ def time_interpolate_some_good(self, downcast): class Shift(object): # frame shift speedup issue-5609 - goal_time = 0.2 params = [0, 1] param_names = ['axis'] @@ -411,8 +405,6 @@ def time_frame_nunique(self): class Duplicated(object): - goal_time = 0.2 - def setup(self): n = (1 << 20) t = date_range('2015-01-01', freq='S', periods=(n // 64)) @@ -431,7 +423,6 @@ def time_frame_duplicated_wide(self): class XS(object): - goal_time = 0.2 params = [0, 1] param_names = ['axis'] @@ -445,7 +436,6 @@ def time_frame_xs(self, axis): class SortValues(object): - goal_time = 0.2 params = [True, False] param_names = ['ascending'] @@ -458,8 +448,6 @@ def time_frame_sort_values(self, ascending): class SortIndexByColumns(object): - goal_time = 0.2 - def setup(self): N = 10000 K = 10 @@ -473,7 +461,6 @@ def time_frame_sort_values_by_columns(self): class Quantile(object): - goal_time = 0.2 params = [0, 1] param_names = ['axis'] @@ -486,8 +473,6 @@ def time_frame_quantile(self, axis): class GetDtypeCounts(object): # 2807 - goal_time = 0.2 - def setup(self): self.df = DataFrame(np.random.randn(10, 10000)) @@ -500,7 +485,6 @@ def time_info(self): class NSort(object): - goal_time = 0.2 params = ['first', 'last', 'all'] param_names = ['keep'] @@ -523,8 +507,6 @@ def time_nsmallest_two_columns(self, keep): class Describe(object): - goal_time = 0.2 - def setup(self): self.df = DataFrame({ 'a': np.random.randint(0, 100, int(1e6)), @@ -537,3 +519,6 @@ def time_series_describe(self): def time_dataframe_describe(self): self.df.describe() + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 21c1ccf46e1c4..6819a296c81df 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -23,12 +23,11 @@ def wrapper(fname): return fname return wrapper -from .pandas_vb_common import BaseIO, setup # noqa +from .pandas_vb_common import BaseIO class ParallelGroupbyMethods(object): - goal_time = 0.2 params = ([2, 4, 8], ['count', 'last', 'max', 'mean', 'min', 'prod', 'sum', 'var']) param_names = ['threads', 'method'] @@ -60,7 +59,6 @@ def time_loop(self, threads, method): class ParallelGroups(object): - goal_time = 0.2 params = [2, 4, 8] param_names = ['threads'] @@ -82,7 +80,6 @@ def time_get_groups(self, threads): class ParallelTake1D(object): - goal_time = 0.2 params = ['int64', 'float64'] param_names = ['dtype'] @@ -126,8 +123,6 @@ def 
time_kth_smallest(self): class ParallelDatetimeFields(object): - goal_time = 0.2 - def setup(self): if not have_real_test_parallel: raise NotImplementedError @@ -174,7 +169,6 @@ def run(period): class ParallelRolling(object): - goal_time = 0.2 params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std'] param_names = ['method'] @@ -273,3 +267,6 @@ def time_parallel(self, threads): def time_loop(self, threads): for i in range(threads): self.loop() + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index b51b41614bc49..59e43ee22afde 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,14 +1,14 @@ -import warnings -from string import ascii_letters -from itertools import product from functools import partial +from itertools import product +from string import ascii_letters +import warnings import numpy as np -from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, - TimeGrouper, Categorical, Timestamp) -import pandas.util.testing as tm -from .pandas_vb_common import setup # noqa +from pandas import ( + Categorical, DataFrame, MultiIndex, Series, TimeGrouper, Timestamp, + date_range, period_range) +import pandas.util.testing as tm method_blacklist = { @@ -22,8 +22,6 @@ class ApplyDictReturn(object): - goal_time = 0.2 - def setup(self): self.labels = np.arange(1000).repeat(10) self.data = Series(np.random.randn(len(self.labels))) @@ -35,8 +33,6 @@ def time_groupby_apply_dict_return(self): class Apply(object): - goal_time = 0.2 - def setup_cache(self): N = 10**4 labels = np.random.randint(0, 2000, size=N) @@ -69,8 +65,6 @@ def time_copy_overhead_single_col(self, df): class Groups(object): - goal_time = 0.2 - param_names = ['key'] params = ['int64_small', 'int64_large', 'object_small', 'object_large'] @@ -95,7 +89,6 @@ def time_series_groups(self, data, key): class GroupManyLabels(object): - goal_time = 0.2 params = [1, 1000] param_names = ['ncols'] @@ -111,8 +104,6 @@ def time_sum(self, ncols): class Nth(object): - goal_time = 0.2 - param_names = ['dtype'] params = ['float32', 'float64', 'datetime', 'object'] @@ -151,8 +142,6 @@ def time_series_nth(self, dtype): class DateAttributes(object): - goal_time = 0.2 - def setup(self): rng = date_range('1/1/2000', '12/31/2005', freq='H') self.year, self.month, self.day = rng.year, rng.month, rng.day @@ -164,8 +153,6 @@ def time_len_groupby_object(self): class Int64(object): - goal_time = 0.2 - def setup(self): arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5)) i = np.random.choice(len(arr), len(arr) * 5) @@ -182,8 +169,6 @@ def time_overflow(self): class CountMultiDtype(object): - goal_time = 0.2 - def setup_cache(self): n = 10000 offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') @@ -210,8 +195,6 @@ def time_multi_count(self, df): class CountMultiInt(object): - goal_time = 0.2 - def setup_cache(self): n = 10000 df = DataFrame({'key1': np.random.randint(0, 500, size=n), @@ -229,9 +212,7 @@ def time_multi_int_nunique(self, df): class AggFunctions(object): - goal_time = 0.2 - - def setup_cache(): + def setup_cache(self): N = 10**5 fac1 = np.array(['A', 'B', 'C'], dtype='O') fac2 = np.array(['one', 'two'], dtype='O') @@ -261,8 +242,6 @@ def time_different_python_functions_singlecol(self, df): class GroupStrings(object): - goal_time = 0.2 - def setup(self): n = 2 * 10**5 alpha = list(map(''.join, product(ascii_letters, repeat=4))) @@ -278,8 +257,6 @@ def time_multi_columns(self): class 
MultiColumn(object): - goal_time = 0.2 - def setup_cache(self): N = 10**5 key1 = np.tile(np.arange(100, dtype=object), 1000) @@ -307,8 +284,6 @@ def time_col_select_numpy_sum(self, df): class Size(object): - goal_time = 0.2 - def setup(self): n = 10**5 offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') @@ -336,8 +311,6 @@ def time_category_size(self): class GroupByMethods(object): - goal_time = 0.2 - param_names = ['dtype', 'method', 'application'] params = [['int', 'float', 'object', 'datetime'], ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', @@ -387,7 +360,6 @@ def time_dtype_as_field(self, dtype, method, application): class RankWithTies(object): # GH 21237 - goal_time = 0.2 param_names = ['dtype', 'tie_method'] params = [['float64', 'float32', 'int64', 'datetime64'], ['first', 'average', 'dense', 'min', 'max']] @@ -406,8 +378,6 @@ def time_rank_ties(self, dtype, tie_method): class Float32(object): # GH 13335 - goal_time = 0.2 - def setup(self): tmp1 = (np.random.random(10000) * 0.1).astype(np.float32) tmp2 = (np.random.random(10000) * 10.0).astype(np.float32) @@ -421,8 +391,6 @@ def time_sum(self): class Categories(object): - goal_time = 0.2 - def setup(self): N = 10**5 arr = np.random.random(N) @@ -459,7 +427,6 @@ def time_groupby_extra_cat_nosort(self): class Datelike(object): # GH 14338 - goal_time = 0.2 params = ['period_range', 'date_range', 'date_range_tz'] param_names = ['grouper'] @@ -477,8 +444,6 @@ def time_sum(self, grouper): class SumBools(object): # GH 2692 - goal_time = 0.2 - def setup(self): N = 500 self.df = DataFrame({'ii': range(N), @@ -490,7 +455,6 @@ def time_groupby_sum_booleans(self): class SumMultiLevel(object): # GH 9049 - goal_time = 0.2 timeout = 120.0 def setup(self): @@ -505,14 +469,12 @@ def time_groupby_sum_multiindex(self): class Transform(object): - goal_time = 0.2 - def setup(self): n1 = 400 n2 = 250 index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)], - labels=[np.repeat(range(n1), n2).tolist(), - list(range(n2)) * n1], + codes=[np.repeat(range(n1), n2).tolist(), + list(range(n2)) * n1], names=['lev1', 'lev2']) arr = np.random.randn(n1 * n2, 3) arr[::10000, 0] = np.nan @@ -553,8 +515,6 @@ def time_transform_multi_key4(self): class TransformBools(object): - goal_time = 0.2 - def setup(self): N = 120000 transition_points = np.sort(np.random.choice(np.arange(N), 1400)) @@ -569,8 +529,6 @@ def time_transform_mean(self): class TransformNaN(object): # GH 12737 - goal_time = 0.2 - def setup(self): self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10), 'B': np.nan, @@ -579,3 +537,6 @@ def setup(self): def time_first(self): self.df_nans.groupby('key').transform('first') + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index f1703e163917a..f76040921393f 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -3,12 +3,9 @@ from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex, Float64Index) -from .pandas_vb_common import setup # noqa - class SetOperations(object): - goal_time = 0.2 params = (['datetime', 'date_string', 'int', 'strings'], ['intersection', 'union', 'symmetric_difference']) param_names = ['dtype', 'method'] @@ -34,8 +31,6 @@ def time_operation(self, dtype, method): class SetDisjoint(object): - goal_time = 0.2 - def setup(self): N = 10**5 B = N + 20000 @@ -48,8 +43,6 @@ def time_datetime_difference_disjoint(self): class Datetime(object): - 
goal_time = 0.2 - def setup(self): self.dr = date_range('20000101', freq='D', periods=10000) @@ -86,8 +79,6 @@ def time_modulo(self, dtype): class Range(object): - goal_time = 0.2 - def setup(self): self.idx_inc = RangeIndex(start=0, stop=10**7, step=3) self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3) @@ -107,8 +98,6 @@ def time_min_trivial(self): class IndexAppend(object): - goal_time = 0.2 - def setup(self): N = 10000 @@ -138,7 +127,6 @@ def time_append_obj_list(self): class Indexing(object): - goal_time = 0.2 params = ['String', 'Float', 'Int'] param_names = ['dtype'] @@ -183,8 +171,6 @@ def time_get_loc_non_unique_sorted(self, dtype): class Float64IndexMethod(object): # GH 13166 - goal_time = 0.2 - def setup(self): N = 100000 a = np.arange(N) @@ -192,3 +178,6 @@ def setup(self): def time_get_loc(self): self.ind.get_loc(0) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 2850fa249725c..57ba9cd80e55c 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -2,17 +2,16 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, DataFrame, MultiIndex, Panel, - Int64Index, Float64Index, IntervalIndex, - CategoricalIndex, IndexSlice, concat, date_range) -from .pandas_vb_common import setup # noqa +from pandas import (Series, DataFrame, Panel, MultiIndex, + Int64Index, UInt64Index, Float64Index, + IntervalIndex, CategoricalIndex, + IndexSlice, concat, date_range) class NumericSeriesIndexing(object): - goal_time = 0.2 params = [ - (Int64Index, Float64Index), + (Int64Index, UInt64Index, Float64Index), ('unique_monotonic_inc', 'nonunique_monotonic_inc'), ] param_names = ['index_dtype', 'index_structure'] @@ -82,7 +81,6 @@ def time_loc_slice(self, index, index_structure): class NonNumericSeriesIndexing(object): - goal_time = 0.2 params = [ ('string', 'datetime'), ('unique_monotonic_inc', 'nonunique_monotonic_inc'), @@ -118,8 +116,6 @@ def time_getitem_list_like(self, index, index_structure): class DataFrameStringIndexing(object): - goal_time = 0.2 - def setup(self): index = tm.makeStringIndex(1000) columns = tm.makeStringIndex(30) @@ -152,8 +148,6 @@ def time_boolean_rows_object(self): class DataFrameNumericIndexing(object): - goal_time = 0.2 - def setup(self): self.idx_dupe = np.array(range(30)) * 99 self.df = DataFrame(np.random.randn(10000, 5)) @@ -178,7 +172,6 @@ def time_bool_indexer(self): class Take(object): - goal_time = 0.2 params = ['int', 'datetime'] param_names = ['index'] @@ -196,8 +189,6 @@ def time_take(self, index): class MultiIndexing(object): - goal_time = 0.2 - def setup(self): mi = MultiIndex.from_product([range(1000), range(1000)]) self.s = Series(np.random.randn(1000000), index=mi) @@ -226,8 +217,6 @@ def time_index_slice(self): class IntervalIndexing(object): - goal_time = 0.2 - def setup_cache(self): idx = IntervalIndex.from_breaks(np.arange(1000001)) monotonic = Series(np.arange(1000000), index=idx) @@ -248,7 +237,6 @@ def time_loc_list(self, monotonic): class CategoricalIndexIndexing(object): - goal_time = 0.2 params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] param_names = ['index'] @@ -291,8 +279,6 @@ def time_get_indexer_list(self, index): class PanelIndexing(object): - goal_time = 0.2 - def setup(self): with warnings.catch_warnings(record=True): self.p = Panel(np.random.randn(100, 100, 100)) @@ -305,8 +291,6 @@ def time_subset(self): class MethodLookup(object): - goal_time = 0.2 - def setup_cache(self): 
s = Series() return s @@ -323,8 +307,6 @@ def time_lookup_loc(self, s): class GetItemSingleColumn(object): - goal_time = 0.2 - def setup(self): self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A']) self.df_int_col = DataFrame(np.random.randn(3000, 1)) @@ -338,8 +320,6 @@ def time_frame_getitem_single_column_int(self): class AssignTimeseriesIndex(object): - goal_time = 0.2 - def setup(self): N = 100000 idx = date_range('1/1/2000', periods=N, freq='H') @@ -351,8 +331,6 @@ def time_frame_assign_timeseries_index(self): class InsertColumns(object): - goal_time = 0.2 - def setup(self): self.N = 10**3 self.df = DataFrame(index=range(self.N)) @@ -367,3 +345,6 @@ def time_assign_with_setitem(self): np.random.seed(1234) for i in range(100): self.df[i] = np.random.randn(self.N) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py new file mode 100644 index 0000000000000..f3d063ee31bc8 --- /dev/null +++ b/asv_bench/benchmarks/indexing_engines.py @@ -0,0 +1,64 @@ +import numpy as np + +from pandas._libs import index as libindex + + +def _get_numeric_engines(): + engine_names = [ + ('Int64Engine', np.int64), ('Int32Engine', np.int32), + ('Int16Engine', np.int16), ('Int8Engine', np.int8), + ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32), + ('UInt16engine', np.uint16), ('UInt8Engine', np.uint8), + ('Float64Engine', np.float64), ('Float32Engine', np.float32), + ] + return [(getattr(libindex, engine_name), dtype) + for engine_name, dtype in engine_names + if hasattr(libindex, engine_name)] + + +class NumericEngineIndexing(object): + + params = [_get_numeric_engines(), + ['monotonic_incr', 'monotonic_decr', 'non_monotonic'], + ] + param_names = ['engine_and_dtype', 'index_type'] + + def setup(self, engine_and_dtype, index_type): + engine, dtype = engine_and_dtype + N = 10**5 + values = list([1] * N + [2] * N + [3] * N) + arr = { + 'monotonic_incr': np.array(values, dtype=dtype), + 'monotonic_decr': np.array(list(reversed(values)), + dtype=dtype), + 'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype), + }[index_type] + + self.data = engine(lambda: arr, len(arr)) + # code belows avoids populating the mapping etc. while timing. + self.data.get_loc(2) + + def time_get_loc(self, engine_and_dtype, index_type): + self.data.get_loc(2) + + +class ObjectEngineIndexing(object): + + params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')] + param_names = ['index_type'] + + def setup(self, index_type): + N = 10**5 + values = list('a' * N + 'b' * N + 'c' * N) + arr = { + 'monotonic_incr': np.array(values, dtype=object), + 'monotonic_decr': np.array(list(reversed(values)), dtype=object), + 'non_monotonic': np.array(list('abc') * N, dtype=object), + }[index_type] + + self.data = libindex.ObjectEngine(lambda: arr, len(arr)) + # code belows avoids populating the mapping etc. while timing. 
+ self.data.get_loc('b') + + def time_get_loc(self, index_type): + self.data.get_loc('b') diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 16d9e7cd73cbb..423bd02b93596 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -2,12 +2,11 @@ import pandas.util.testing as tm from pandas import DataFrame, Series, to_numeric -from .pandas_vb_common import numeric_dtypes, lib, setup # noqa +from .pandas_vb_common import numeric_dtypes, lib class NumericInferOps(object): # from GH 7332 - goal_time = 0.2 params = numeric_dtypes param_names = ['dtype'] @@ -34,8 +33,6 @@ def time_modulo(self, dtype): class DateInferOps(object): # from GH 7332 - goal_time = 0.2 - def setup_cache(self): N = 5 * 10**5 df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')}) @@ -54,7 +51,6 @@ def time_add_timedeltas(self, df): class ToNumeric(object): - goal_time = 0.2 params = ['ignore', 'coerce'] param_names = ['errors'] @@ -111,3 +107,6 @@ def setup_cache(self): def time_convert(self, data): lib.maybe_convert_numeric(data, set(), coerce_numeric=False) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 12cb893462b87..771f2795334e1 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -6,12 +6,11 @@ from pandas import DataFrame, Categorical, date_range, read_csv from pandas.compat import cStringIO as StringIO -from ..pandas_vb_common import setup, BaseIO # noqa +from ..pandas_vb_common import BaseIO class ToCSV(BaseIO): - goal_time = 0.2 fname = '__test__.csv' params = ['wide', 'long', 'mixed'] param_names = ['kind'] @@ -41,7 +40,6 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): - goal_time = 0.2 fname = '__test__.csv' def setup(self): @@ -61,7 +59,6 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): - goal_time = 0.2 params = ([True, False], ['custom', 'iso8601', 'ymd']) param_names = ['infer_datetime_format', 'format'] @@ -82,7 +79,6 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVSkipRows(BaseIO): - goal_time = 0.2 fname = '__test__.csv' params = [None, 10000] param_names = ['skiprows'] @@ -104,8 +100,6 @@ def time_skipprows(self, skiprows): class ReadUint64Integers(StringIORewind): - goal_time = 0.2 - def setup(self): self.na_values = [2**63 + 500] arr = np.arange(10000).astype('uint64') + 2**63 @@ -127,7 +121,6 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): - goal_time = 0.2 fname = '__test__.csv' params = ([',', '|'], [None, ',']) param_names = ['sep', 'thousands'] @@ -149,8 +142,6 @@ def time_thousands(self, sep, thousands): class ReadCSVComment(StringIORewind): - goal_time = 0.2 - def setup(self): data = ['A,B,C'] + (['1,2,3 # comment'] * 100000) self.StringIO_input = StringIO('\n'.join(data)) @@ -162,7 +153,6 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): - goal_time = 0.2 params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip']) param_names = ['sep', 'decimal', 'float_precision'] @@ -185,7 +175,6 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): class ReadCSVCategorical(BaseIO): - goal_time = 0.2 fname = '__test__.csv' def setup(self): @@ -203,8 +192,6 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - goal_time = 0.2 - def setup(self): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n 
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n @@ -225,3 +212,6 @@ def time_baseline(self): read_csv(self.data(self.StringIO_input), sep=',', header=None, parse_dates=[1], names=list(string.digits[:9])) + + +from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 58ab6bb8046c5..1bee864fbcf2d 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -3,12 +3,9 @@ from pandas.compat import BytesIO import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO, setup # noqa - class Excel(object): - goal_time = 0.2 params = ['openpyxl', 'xlsxwriter', 'xlwt'] param_names = ['engine'] @@ -34,3 +31,6 @@ def time_write_excel(self, engine): writer_write = ExcelWriter(bio_write, engine=engine) self.df.to_excel(writer_write, sheet_name='Sheet1') writer_write.save() + + +from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 4b6e1d69af92d..f08904ba70a5f 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -4,13 +4,11 @@ from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO, setup # noqa +from ..pandas_vb_common import BaseIO class HDFStoreDataFrame(BaseIO): - goal_time = 0.2 - def setup(self): N = 25000 index = tm.makeStringIndex(N) @@ -103,8 +101,6 @@ def time_store_info(self): class HDFStorePanel(BaseIO): - goal_time = 0.2 - def setup(self): self.fname = '__test__.h5' with warnings.catch_warnings(record=True): @@ -130,7 +126,6 @@ def time_write_store_table_panel(self): class HDF(BaseIO): - goal_time = 0.2 params = ['table', 'fixed'] param_names = ['format'] @@ -149,3 +144,6 @@ def time_read_hdf(self, format): def time_write_hdf(self, format): self.df.to_hdf(self.fname, 'df', format=format) + + +from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index acfdd327c3b51..ec2ddc11b7c1d 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -2,12 +2,11 @@ import pandas.util.testing as tm from pandas import DataFrame, date_range, timedelta_range, concat, read_json -from ..pandas_vb_common import setup, BaseIO # noqa +from ..pandas_vb_common import BaseIO class ReadJSON(BaseIO): - goal_time = 0.2 fname = "__test__.json" params = (['split', 'index', 'records'], ['int', 'datetime']) param_names = ['orient', 'index'] @@ -27,7 +26,6 @@ def time_read_json(self, orient, index): class ReadJSONLines(BaseIO): - goal_time = 0.2 fname = "__test_lines__.json" params = ['int', 'datetime'] param_names = ['index'] @@ -58,7 +56,6 @@ def peakmem_read_json_lines_concat(self, index): class ToJSON(BaseIO): - goal_time = 0.2 fname = "__test__.json" params = ['split', 'columns', 'index'] param_names = ['orient'] @@ -125,3 +122,6 @@ def time_float_int_lines(self, orient): def time_float_int_str_lines(self, orient): self.df_int_float_str.to_json(self.fname, orient='records', lines=True) + + +from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index 8ccce01117ca4..dc2642d920fd0 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -2,13 +2,11 @@ from pandas import DataFrame, date_range, read_msgpack import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO, setup # noqa 
+from ..pandas_vb_common import BaseIO class MSGPack(BaseIO): - goal_time = 0.2 - def setup(self): self.fname = '__test__.msg' N = 100000 @@ -24,3 +22,6 @@ def time_read_msgpack(self): def time_write_msgpack(self): self.df.to_msgpack(self.fname) + + +from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 2ad0fcca6eb26..74a58bbb946aa 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -2,13 +2,11 @@ from pandas import DataFrame, date_range, read_pickle import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO, setup # noqa +from ..pandas_vb_common import BaseIO class Pickle(BaseIO): - goal_time = 0.2 - def setup(self): self.fname = '__test__.pkl' N = 100000 @@ -24,3 +22,6 @@ def time_read_pickle(self): def time_write_pickle(self): self.df.to_pickle(self.fname) + + +from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py index 526c524de7fff..2783f42cad895 100644 --- a/asv_bench/benchmarks/io/sas.py +++ b/asv_bench/benchmarks/io/sas.py @@ -5,7 +5,6 @@ class SAS(object): - goal_time = 0.2 params = ['sas7bdat', 'xport'] param_names = ['format'] diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index ef4e501e5f3b9..075d3bdda5ed9 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -5,12 +5,9 @@ from pandas import DataFrame, date_range, read_sql_query, read_sql_table from sqlalchemy import create_engine -from ..pandas_vb_common import setup # noqa - class SQL(object): - goal_time = 0.2 params = ['sqlalchemy', 'sqlite'] param_names = ['connection'] @@ -43,7 +40,6 @@ def time_read_sql_query(self, connection): class WriteSQLDtypes(object): - goal_time = 0.2 params = (['sqlalchemy', 'sqlite'], ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']) param_names = ['connection', 'dtype'] @@ -77,8 +73,6 @@ def time_read_sql_query_select_column(self, connection, dtype): class ReadSQLTable(object): - goal_time = 0.2 - def setup(self): N = 10000 self.table_name = 'test' @@ -106,8 +100,6 @@ def time_read_sql_table_parse_dates(self): class ReadSQLTableDtypes(object): - goal_time = 0.2 - params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'] param_names = ['dtype'] @@ -130,3 +122,6 @@ def setup(self, dtype): def time_read_sql_table_column(self, dtype): read_sql_table(self.table_name, self.con, columns=[dtype]) + + +from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index e0f5752ca930f..a7f854a853f50 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -2,12 +2,11 @@ from pandas import DataFrame, date_range, read_stata import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO, setup # noqa +from ..pandas_vb_common import BaseIO class Stata(BaseIO): - goal_time = 0.2 params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'] param_names = ['convert_dates'] @@ -35,3 +34,6 @@ def time_read_stata(self, convert_dates): def time_write_stata(self, convert_dates): self.df.to_stata(self.fname, self.convert_dates) + + +from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 6624c3d0aaf49..88a59fea375ea 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -11,13 +11,9 @@ except ImportError: 
from pandas import ordered_merge as merge_ordered -from .pandas_vb_common import setup # noqa - class Append(object): - goal_time = 0.2 - def setup(self): self.df1 = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) @@ -27,11 +23,7 @@ def setup(self): self.mdf1['obj1'] = 'bar' self.mdf1['obj2'] = 'bar' self.mdf1['int1'] = 5 - try: - with warnings.catch_warnings(record=True): - self.mdf1.consolidate(inplace=True) - except (AttributeError, TypeError): - pass + self.mdf1 = self.mdf1._consolidate() self.mdf2 = self.mdf1.copy() self.mdf2.index = self.df2.index @@ -44,7 +36,6 @@ def time_append_mixed(self): class Concat(object): - goal_time = 0.2 params = [0, 1] param_names = ['axis'] @@ -73,7 +64,6 @@ def time_concat_empty_left(self, axis): class ConcatPanels(object): - goal_time = 0.2 params = ([0, 1, 2], [True, False]) param_names = ['axis', 'ignore_index'] @@ -99,7 +89,6 @@ def time_f_ordered(self, axis, ignore_index): class ConcatDataFrames(object): - goal_time = 0.2 params = ([0, 1], [True, False]) param_names = ['axis', 'ignore_index'] @@ -120,23 +109,22 @@ def time_f_ordered(self, axis, ignore_index): class Join(object): - goal_time = 0.2 params = [True, False] param_names = ['sort'] def setup(self, sort): level1 = tm.makeStringIndex(10).values level2 = tm.makeStringIndex(1000).values - label1 = np.arange(10).repeat(1000) - label2 = np.tile(np.arange(1000), 10) + codes1 = np.arange(10).repeat(1000) + codes2 = np.tile(np.arange(1000), 10) index2 = MultiIndex(levels=[level1, level2], - labels=[label1, label2]) + codes=[codes1, codes2]) self.df_multi = DataFrame(np.random.randn(len(index2), 4), index=index2, columns=['A', 'B', 'C', 'D']) - self.key1 = np.tile(level1.take(label1), 10) - self.key2 = np.tile(level2.take(label2), 10) + self.key1 = np.tile(level1.take(codes1), 10) + self.key2 = np.tile(level2.take(codes2), 10) self.df = DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, @@ -168,8 +156,6 @@ def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): class JoinIndex(object): - goal_time = 0.2 - def setup(self): N = 50000 self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)), @@ -184,8 +170,6 @@ def time_left_outer_join_index(self): class JoinNonUnique(object): # outer join of non-unique # GH 6329 - goal_time = 0.2 - def setup(self): date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') daily_dates = date_index.to_period('D').to_timestamp('S', 'S') @@ -202,7 +186,6 @@ def time_join_non_unique_equal(self): class Merge(object): - goal_time = 0.2 params = [True, False] param_names = ['sort'] @@ -237,7 +220,6 @@ def time_merge_dataframe_integer_key(self, sort): class I8Merge(object): - goal_time = 0.2 params = ['inner', 'outer', 'left', 'right'] param_names = ['how'] @@ -256,8 +238,6 @@ def time_i8merge(self, how): class MergeCategoricals(object): - goal_time = 0.2 - def setup(self): self.left_object = DataFrame( {'X': np.random.choice(range(0, 10), size=(10000,)), @@ -345,8 +325,6 @@ def time_multiby(self): class Align(object): - goal_time = 0.2 - def setup(self): size = 5 * 10**5 rng = np.arange(0, 10**13, 10**7) @@ -361,3 +339,6 @@ def time_series_align_int64_index(self): def time_series_align_left_monotonic(self): self.ts1.align(self.ts2, join='left') + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 0c92214795557..adc6730dcd946 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ 
b/asv_bench/benchmarks/multiindex_object.py @@ -4,13 +4,9 @@ import pandas.util.testing as tm from pandas import date_range, MultiIndex -from .pandas_vb_common import setup # noqa - class GetLoc(object): - goal_time = 0.2 - def setup(self): self.mi_large = MultiIndex.from_product( [np.arange(1000), np.arange(20), list(string.ascii_letters)], @@ -46,8 +42,6 @@ def time_small_get_loc_warm(self): class Duplicates(object): - goal_time = 0.2 - def setup(self): size = 65536 arrays = [np.random.randint(0, 8192, size), @@ -62,8 +56,6 @@ def time_remove_unused_levels(self): class Integer(object): - goal_time = 0.2 - def setup(self): self.mi_int = MultiIndex.from_product([np.arange(1000), np.arange(1000)], @@ -82,15 +74,13 @@ def time_is_monotonic(self): class Duplicated(object): - goal_time = 0.2 - def setup(self): n, k = 200, 5000 levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] - labels = [np.random.choice(n, (k * n)) for lev in levels] - self.mi = MultiIndex(levels=levels, labels=labels) + codes = [np.random.choice(n, (k * n)) for lev in levels] + self.mi = MultiIndex(levels=levels, codes=codes) def time_duplicated(self): self.mi.duplicated() @@ -98,8 +88,6 @@ def time_duplicated(self): class Sortlevel(object): - goal_time = 0.2 - def setup(self): n = 1182720 low, high = -4096, 4096 @@ -124,8 +112,6 @@ def time_sortlevel_one(self): class Values(object): - goal_time = 0.2 - def setup_cache(self): level1 = range(1000) @@ -138,3 +124,6 @@ def time_datetime_level_values_copy(self, mi): def time_datetime_level_values_sliced(self, mi): mi[:10].values + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index e161b887ee86f..4570e73cccc71 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -34,8 +34,6 @@ class ApplyIndex(object): - goal_time = 0.2 - params = other_offsets param_names = ['offset'] @@ -49,8 +47,6 @@ def time_apply_index(self, offset): class OnOffset(object): - goal_time = 0.2 - params = offsets param_names = ['offset'] @@ -67,7 +63,6 @@ def time_on_offset(self, offset): class OffsetSeriesArithmetic(object): - goal_time = 0.2 params = offsets param_names = ['offset'] @@ -83,7 +78,6 @@ def time_add_offset(self, offset): class OffsetDatetimeIndexArithmetic(object): - goal_time = 0.2 params = offsets param_names = ['offset'] @@ -98,7 +92,6 @@ def time_add_offset(self, offset): class OffestDatetimeArithmetic(object): - goal_time = 0.2 params = offsets param_names = ['offset'] diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index 4614bbd198afa..47b3ad612f9b1 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -3,12 +3,8 @@ from pandas import DataFrame, Panel, DatetimeIndex, date_range -from .pandas_vb_common import setup # noqa - class DifferentIndexes(object): - goal_time = 0.2 - def setup(self): self.data_frames = {} start = datetime(1990, 1, 1) @@ -26,8 +22,6 @@ def time_from_dict(self): class SameIndexes(object): - goal_time = 0.2 - def setup(self): idx = DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), @@ -42,8 +36,6 @@ def time_from_dict(self): class TwoIndexes(object): - goal_time = 0.2 - def setup(self): start = datetime(1990, 1, 1) end = datetime(2012, 1, 1) @@ -58,3 +50,6 @@ def setup(self): def time_from_dict(self): with warnings.catch_warnings(record=True): Panel.from_dict(self.data_frames) + + +from .pandas_vb_common import setup # noqa: F401 diff --git 
a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index 4d19e9a87c507..a4c12c082236e 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -3,12 +3,9 @@ import numpy as np from pandas import Panel -from .pandas_vb_common import setup # noqa - class PanelMethods(object): - goal_time = 0.2 params = ['items', 'major', 'minor'] param_names = ['axis'] @@ -23,3 +20,6 @@ def time_pct_change(self, axis): def time_shift(self, axis): with warnings.catch_warnings(record=True): self.panel.shift(1, axis=axis) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index c34f9a737473e..1af1ba1fb7b0b 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -37,14 +37,13 @@ def time_asfreq(self, freq): class PeriodIndexConstructor(object): - goal_time = 0.2 - params = ['D'] param_names = ['freq'] def setup(self, freq): self.rng = date_range('1985', periods=1000) self.rng2 = date_range('1985', periods=1000).to_pydatetime() + self.ints = list(range(2000, 3000)) def time_from_date_range(self, freq): PeriodIndex(self.rng, freq=freq) @@ -52,10 +51,11 @@ def time_from_date_range(self, freq): def time_from_pydatetime(self, freq): PeriodIndex(self.rng2, freq=freq) + def time_from_ints(self, freq): + PeriodIndex(self.ints, freq=freq) -class DataFramePeriodColumn(object): - goal_time = 0.2 +class DataFramePeriodColumn(object): def setup(self): self.rng = period_range(start='1/1/1990', freq='S', periods=20000) @@ -72,8 +72,6 @@ def time_set_index(self): class Algorithms(object): - goal_time = 0.2 - params = ['index', 'series'] param_names = ['typ'] @@ -95,8 +93,6 @@ def time_value_counts(self, typ): class Indexing(object): - goal_time = 0.2 - def setup(self): self.index = PeriodIndex(start='1985', periods=1000, freq='D') self.series = Series(range(1000), index=self.index) @@ -119,3 +115,6 @@ def time_align(self): def time_intersection(self): self.index[:750].intersection(self.index[250:]) + + def time_unique(self): + self.index.unique() diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 5b49112b0e07d..4f0bbb1690d4b 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -7,27 +7,52 @@ import matplotlib matplotlib.use('Agg') -from .pandas_vb_common import setup # noqa +class SeriesPlotting(object): + params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']] + param_names = ['kind'] -class Plotting(object): + def setup(self, kind): + if kind in ['bar', 'barh', 'pie']: + n = 100 + elif kind in ['kde']: + n = 10000 + else: + n = 1000000 - goal_time = 0.2 + self.s = Series(np.random.randn(n)) + if kind in ['area', 'pie']: + self.s = self.s.abs() - def setup(self): - self.s = Series(np.random.randn(1000000)) - self.df = DataFrame({'col': self.s}) + def time_series_plot(self, kind): + self.s.plot(kind=kind) - def time_series_plot(self): - self.s.plot() - def time_frame_plot(self): - self.df.plot() +class FramePlotting(object): + params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter', + 'hexbin']] + param_names = ['kind'] + def setup(self, kind): + if kind in ['bar', 'barh', 'pie']: + n = 100 + elif kind in ['kde', 'scatter', 'hexbin']: + n = 10000 + else: + n = 1000000 -class TimeseriesPlotting(object): + self.x = Series(np.random.randn(n)) + self.y = Series(np.random.randn(n)) + if kind in ['area', 'pie']: + self.x = self.x.abs() + self.y = self.y.abs() + 
self.df = DataFrame({'x': self.x, 'y': self.y}) - goal_time = 0.2 + def time_frame_plot(self, kind): + self.df.plot(x='x', y='y', kind=kind) + + +class TimeseriesPlotting(object): def setup(self): N = 2000 @@ -52,8 +77,6 @@ def time_plot_irregular(self): class Misc(object): - goal_time = 0.6 - def setup(self): N = 500 M = 10 @@ -62,3 +85,6 @@ def setup(self): def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 413427a16f40b..576dc495eb984 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -2,13 +2,11 @@ import pandas.util.testing as tm from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index, date_range) -from .pandas_vb_common import setup, lib # noqa +from .pandas_vb_common import lib class Reindex(object): - goal_time = 0.2 - def setup(self): rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') self.df = DataFrame(np.random.rand(10000, 10), index=rng, @@ -37,7 +35,6 @@ def time_reindex_multiindex(self): class ReindexMethod(object): - goal_time = 0.2 params = ['pad', 'backfill'] param_names = ['method'] @@ -52,7 +49,6 @@ def time_reindex_method(self, method): class Fillna(object): - goal_time = 0.2 params = ['pad', 'backfill'] param_names = ['method'] @@ -72,14 +68,12 @@ def time_float_32(self, method): class LevelAlign(object): - goal_time = 0.2 - def setup(self): self.index = MultiIndex( levels=[np.arange(10), np.arange(100), np.arange(100)], - labels=[np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)]) + codes=[np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)]) self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) self.df_level = DataFrame(np.random.randn(100, 4), @@ -94,7 +88,6 @@ def time_reindex_level(self): class DropDuplicates(object): - goal_time = 0.2 params = [True, False] param_names = ['inplace'] @@ -139,8 +132,6 @@ def time_frame_drop_dups_bool(self, inplace): class Align(object): # blog "pandas escaped the zoo" - goal_time = 0.2 - def setup(self): n = 50000 indices = tm.makeStringIndex(n) @@ -156,8 +147,6 @@ def time_align_series_irregular_string(self): class LibFastZip(object): - goal_time = 0.2 - def setup(self): N = 10000 K = 10 @@ -170,3 +159,6 @@ def setup(self): def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 41208125e8f32..d8efaf99e2c4d 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -1,12 +1,9 @@ import numpy as np import pandas as pd -from .pandas_vb_common import setup # noqa - class FillNa(object): - goal_time = 0.2 params = [True, False] param_names = ['inplace'] @@ -26,7 +23,6 @@ def time_replace(self, inplace): class ReplaceDict(object): - goal_time = 0.2 params = [True, False] param_names = ['inplace'] @@ -42,7 +38,6 @@ def time_replace_series(self, inplace): class Convert(object): - goal_time = 0.5 params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta']) param_names = ['constructor', 'replace_data'] @@ -56,3 +51,6 @@ def setup(self, constructor, replace_data): def time_replace(self, constructor, replace_data): self.data.replace(self.to_replace) + + +from .pandas_vb_common import setup # noqa: F401 
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 3cf9a32dab398..e5c2f54263a3c 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -5,13 +5,9 @@ from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long import pandas as pd -from .pandas_vb_common import setup # noqa - class Melt(object): - goal_time = 0.2 - def setup(self): self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C']) self.df['id1'] = np.random.randint(0, 10, 10000) @@ -23,8 +19,6 @@ def time_melt_dataframe(self): class Pivot(object): - goal_time = 0.2 - def setup(self): N = 10000 index = date_range('1/1/2000', periods=N, freq='h') @@ -39,8 +33,6 @@ def time_reshape_pivot_time_series(self): class SimpleReshape(object): - goal_time = 0.2 - def setup(self): arrays = [np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)] @@ -57,30 +49,38 @@ def time_unstack(self): class Unstack(object): - goal_time = 0.2 + params = ['int', 'category'] - def setup(self): + def setup(self, dtype): m = 100 n = 1000 levels = np.arange(m) index = MultiIndex.from_product([levels] * 2) columns = np.arange(n) - values = np.arange(m * m * n).reshape(m * m, n) + if dtype == 'int': + values = np.arange(m * m * n).reshape(m * m, n) + else: + # the category branch is ~20x slower than int. So we + # cut down the size a bit. Now it's only ~3x slower. + n = 50 + columns = columns[:n] + indices = np.random.randint(0, 52, size=(m * m, n)) + values = np.take(list(string.ascii_letters), indices) + values = [pd.Categorical(v) for v in values.T] + self.df = DataFrame(values, index, columns) self.df2 = self.df.iloc[:-1] - def time_full_product(self): + def time_full_product(self, dtype): self.df.unstack() - def time_without_last_row(self): + def time_without_last_row(self, dtype): self.df2.unstack() class SparseIndex(object): - goal_time = 0.2 - def setup(self): NUM_ROWS = 1000 self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS), @@ -97,8 +97,6 @@ def time_unstack(self): class WideToLong(object): - goal_time = 0.2 - def setup(self): nyrs = 20 nidvars = 20 @@ -117,8 +115,6 @@ def time_wide_to_long_big(self): class PivotTable(object): - goal_time = 0.2 - def setup(self): N = 100000 fac1 = np.array(['A', 'B', 'C'], dtype='O') @@ -137,8 +133,6 @@ def time_pivot_table(self): class GetDummies(object): - goal_time = 0.2 - def setup(self): categories = list(string.ascii_letters[:12]) s = pd.Series(np.random.choice(categories, size=1000000), @@ -150,3 +144,44 @@ def time_get_dummies_1d(self): def time_get_dummies_1d_sparse(self): pd.get_dummies(self.s, sparse=True) + + +class Cut(object): + params = [[4, 10, 1000]] + param_names = ['bins'] + + def setup(self, bins): + N = 10**5 + self.int_series = pd.Series(np.arange(N).repeat(5)) + self.float_series = pd.Series(np.random.randn(N).repeat(5)) + self.timedelta_series = pd.Series(np.random.randint(N, size=N), + dtype='timedelta64[ns]') + self.datetime_series = pd.Series(np.random.randint(N, size=N), + dtype='datetime64[ns]') + + def time_cut_int(self, bins): + pd.cut(self.int_series, bins) + + def time_cut_float(self, bins): + pd.cut(self.float_series, bins) + + def time_cut_timedelta(self, bins): + pd.cut(self.timedelta_series, bins) + + def time_cut_datetime(self, bins): + pd.cut(self.datetime_series, bins) + + def time_qcut_int(self, bins): + pd.qcut(self.int_series, bins) + + def time_qcut_float(self, bins): + pd.qcut(self.float_series, bins) + + def time_qcut_timedelta(self, bins): + 
pd.qcut(self.timedelta_series, bins) + + def time_qcut_datetime(self, bins): + pd.qcut(self.datetime_series, bins) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index e3bf551fa5f2b..659b6591fbd4b 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,8 +1,6 @@ import pandas as pd import numpy as np -from .pandas_vb_common import setup # noqa - class Methods(object): @@ -23,6 +21,42 @@ def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() + +class ExpandingMethods(object): + + sample_time = 0.2 + params = (['DataFrame', 'Series'], + ['int', 'float'], + ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', + 'sum']) + param_names = ['constructor', 'dtype', 'method'] + + def setup(self, constructor, dtype, method): + N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + self.expanding = getattr(pd, constructor)(arr).expanding() + + def time_expanding(self, constructor, dtype, method): + getattr(self.expanding, method)() + + +class EWMMethods(object): + + sample_time = 0.2 + params = (['DataFrame', 'Series'], + [10, 1000], + ['int', 'float'], + ['mean', 'std']) + param_names = ['constructor', 'window', 'dtype', 'method'] + + def setup(self, constructor, window, dtype, method): + N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window) + + def time_ewm(self, constructor, window, dtype, method): + getattr(self.ewm, method)() + + class VariableWindowMethods(Methods): sample_time = 0.2 params = (['DataFrame', 'Series'], @@ -77,3 +111,6 @@ def setup(self, constructor, window, dtype, percentile, interpolation): def time_quantile(self, constructor, window, dtype, percentile, interpolation): self.roll.quantile(percentile, interpolation=interpolation) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index a26c5d89bc483..46fb5011cc1a5 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -4,12 +4,9 @@ import pandas.util.testing as tm from pandas import Series, date_range, NaT -from .pandas_vb_common import setup # noqa - class SeriesConstructor(object): - goal_time = 0.2 params = [None, 'dict'] param_names = ['data'] @@ -26,7 +23,6 @@ def time_constructor(self, data): class IsIn(object): - goal_time = 0.2 params = ['int64', 'object'] param_names = ['dtype'] @@ -98,7 +94,6 @@ def time_isin_long_series_long_values_floats(self): class NSort(object): - goal_time = 0.2 params = ['first', 'last', 'all'] param_names = ['keep'] @@ -114,7 +109,6 @@ def time_nsmallest(self, keep): class Dropna(object): - goal_time = 0.2 params = ['int', 'datetime'] param_names = ['dtype'] @@ -132,7 +126,6 @@ def time_dropna(self, dtype): class Map(object): - goal_time = 0.2 params = ['dict', 'Series'] param_names = 'mapper' @@ -148,8 +141,6 @@ def time_map(self, mapper): class Clip(object): - goal_time = 0.2 - def setup(self): self.s = Series(np.random.randn(50)) @@ -159,7 +150,6 @@ def time_clip(self): class ValueCounts(object): - goal_time = 0.2 params = ['int', 'float', 'object'] param_names = ['dtype'] @@ -172,8 +162,6 @@ def time_value_counts(self, dtype): class Dir(object): - goal_time = 0.2 - def setup(self): self.s = Series(index=tm.makeStringIndex(10000)) @@ -183,8 +171,6 @@ def time_dir_strings(self): class SeriesGetattr(object): #
https://github.com/pandas-dev/pandas/issues/19764 - goal_time = 0.2 - def setup(self): self.s = Series(1, index=date_range("2012-01-01", freq='s', @@ -192,3 +178,6 @@ def setup(self): def time_series_datetimeindex_repr(self): getattr(self.s, 'a', None) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index dcb7694abc2ad..64f87c1670170 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -5,8 +5,6 @@ from pandas import (SparseSeries, SparseDataFrame, SparseArray, Series, date_range, MultiIndex) -from .pandas_vb_common import setup # noqa - def make_array(size, dense_proportion, fill_value, dtype): dense_size = int(size * dense_proportion) @@ -18,8 +16,6 @@ def make_array(size, dense_proportion, fill_value, dtype): class SparseSeriesToFrame(object): - goal_time = 0.2 - def setup(self): K = 50 N = 50001 @@ -37,7 +33,6 @@ def time_series_to_frame(self): class SparseArrayConstructor(object): - goal_time = 0.2 params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, np.object]) param_names = ['dense_proportion', 'fill_value', 'dtype'] @@ -52,8 +47,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype): class SparseDataFrameConstructor(object): - goal_time = 0.2 - def setup(self): N = 1000 self.arr = np.arange(N) @@ -72,8 +65,6 @@ def time_from_dict(self): class FromCoo(object): - goal_time = 0.2 - def setup(self): self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), @@ -85,8 +76,6 @@ def time_sparse_series_from_coo(self): class ToCoo(object): - goal_time = 0.2 - def setup(self): s = Series([np.nan] * 10000) s[0] = 3.0 @@ -103,7 +92,6 @@ def time_sparse_series_to_coo(self): class Arithmetic(object): - goal_time = 0.2 params = ([0.1, 0.01], [0, np.nan]) param_names = ['dense_proportion', 'fill_value'] @@ -129,7 +117,6 @@ def time_divide(self, dense_proportion, fill_value): class ArithmeticBlock(object): - goal_time = 0.2 params = [np.nan, 0] param_names = ['fill_value'] @@ -160,3 +147,6 @@ def time_addition(self, fill_value): def time_division(self, fill_value): self.arr1 / self.arr2 + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index ecfcb27806f54..500e4d74d4c4f 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -1,8 +1,6 @@ import numpy as np import pandas as pd -from .pandas_vb_common import setup # noqa - ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem', 'var'] @@ -10,7 +8,6 @@ class FrameOps(object): - goal_time = 0.2 params = [ops, ['float', 'int'], [0, 1], [True, False]] param_names = ['op', 'dtype', 'axis', 'use_bottleneck'] @@ -29,16 +26,15 @@ def time_op(self, op, dtype, axis, use_bottleneck): class FrameMultiIndexOps(object): - goal_time = 0.2 params = ([0, 1, [0, 1]], ops) param_names = ['level', 'op'] def setup(self, level, op): levels = [np.arange(10), np.arange(100), np.arange(100)] - labels = [np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)] - index = pd.MultiIndex(levels=levels, labels=labels) + codes = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, codes=codes) df = pd.DataFrame(np.random.randn(len(index), 4), index=index) self.df_func = getattr(df, op) @@ -48,7 +44,6 @@ def 
time_op(self, level, op): class SeriesOps(object): - goal_time = 0.2 params = [ops, ['float', 'int'], [True, False]] param_names = ['op', 'dtype', 'use_bottleneck'] @@ -67,16 +62,15 @@ def time_op(self, op, dtype, use_bottleneck): class SeriesMultiIndexOps(object): - goal_time = 0.2 params = ([0, 1, [0, 1]], ops) param_names = ['level', 'op'] def setup(self, level, op): levels = [np.arange(10), np.arange(100), np.arange(100)] - labels = [np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)] - index = pd.MultiIndex(levels=levels, labels=labels) + codes = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, codes=codes) s = pd.Series(np.random.randn(len(index)), index=index) self.s_func = getattr(s, op) @@ -86,7 +80,6 @@ def time_op(self, level, op): class Rank(object): - goal_time = 0.2 params = [['DataFrame', 'Series'], [True, False]] param_names = ['constructor', 'pct'] @@ -103,12 +96,42 @@ def time_average_old(self, constructor, pct): class Correlation(object): - goal_time = 0.2 - params = ['spearman', 'kendall', 'pearson'] - param_names = ['method'] + params = [['spearman', 'kendall', 'pearson'], [True, False]] + param_names = ['method', 'use_bottleneck'] - def setup(self, method): + def setup(self, method, use_bottleneck): + try: + pd.options.compute.use_bottleneck = use_bottleneck + except TypeError: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) + self.s = pd.Series(np.random.randn(1000)) + self.s2 = pd.Series(np.random.randn(1000)) - def time_corr(self, method): + def time_corr(self, method, use_bottleneck): self.df.corr(method=method) + + def time_corr_series(self, method, use_bottleneck): + self.s.corr(self.s2, method=method) + + +class Covariance(object): + + params = [[True, False]] + param_names = ['use_bottleneck'] + + def setup(self, use_bottleneck): + try: + pd.options.compute.use_bottleneck = use_bottleneck + except TypeError: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck + self.s = pd.Series(np.random.randn(100000)) + self.s2 = pd.Series(np.random.randn(100000)) + + def time_cov_series(self, use_bottleneck): + self.s.cov(self.s2) + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index ccfac2f73f14d..e9f2727f64e15 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -7,8 +7,6 @@ class Methods(object): - goal_time = 0.2 - def setup(self): self.s = Series(tm.makeStringIndex(10**5)) @@ -28,21 +26,42 @@ def time_extract(self): def time_findall(self): self.s.str.findall('[A-Z]+') + def time_find(self): + self.s.str.find('[A-Z]+') + + def time_rfind(self): + self.s.str.rfind('[A-Z]+') + def time_get(self): self.s.str.get(0) def time_len(self): self.s.str.len() + def time_join(self): + self.s.str.join(' ') + def time_match(self): self.s.str.match('A') + def time_normalize(self): + self.s.str.normalize('NFC') + def time_pad(self): self.s.str.pad(100, side='both') + def time_partition(self): + self.s.str.partition('A') + + def time_rpartition(self): + self.s.str.rpartition('A') + def time_replace(self): self.s.str.replace('A', '\x01\x01') + def time_translate(self): + self.s.str.translate({'A': '\x01\x01'}) + def time_slice(self): self.s.str.slice(5, 15, 2) @@ -67,10 +86,15 @@ def time_upper(self): 
def time_lower(self): self.s.str.lower() + def time_wrap(self): + self.s.str.wrap(10) + + def time_zfill(self): + self.s.str.zfill(10) + class Repeat(object): - goal_time = 0.2 params = ['int', 'array'] param_names = ['repeats'] @@ -86,7 +110,6 @@ def time_repeat(self, repeats): class Cat(object): - goal_time = 0.2 params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15]) param_names = ['other_cols', 'sep', 'na_rep', 'na_frac'] @@ -112,7 +135,6 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): class Contains(object): - goal_time = 0.2 params = [True, False] param_names = ['regex'] @@ -125,7 +147,6 @@ def time_contains(self, regex): class Split(object): - goal_time = 0.2 params = [True, False] param_names = ['expand'] @@ -135,10 +156,11 @@ def setup(self, expand): def time_split(self, expand): self.s.str.split('--', expand=expand) + def time_rsplit(self, expand): + self.s.str.rsplit('--', expand=expand) -class Dummies(object): - goal_time = 0.2 +class Dummies(object): def setup(self): self.s = Series(tm.makeStringIndex(10**5)).str.join('|') @@ -149,8 +171,6 @@ def time_get_dummies(self): class Encode(object): - goal_time = 0.2 - def setup(self): self.ser = Series(tm.makeUnicodeIndex()) @@ -160,8 +180,6 @@ def time_encode_decode(self): class Slice(object): - goal_time = 0.2 - def setup(self): self.s = Series(['abcdefg', np.nan] * 500000) diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 3fe75b3c34299..7ee73fb7ac7b6 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -1,13 +1,12 @@ import datetime import numpy as np -from pandas import Series, timedelta_range, to_timedelta, Timestamp, Timedelta +from pandas import Series, timedelta_range, to_timedelta, Timestamp, \ + Timedelta, TimedeltaIndex, DataFrame class TimedeltaConstructor(object): - goal_time = 0.2 - def time_from_int(self): Timedelta(123456789) @@ -36,8 +35,6 @@ def time_from_missing(self): class ToTimedelta(object): - goal_time = 0.2 - def setup(self): self.ints = np.random.randint(0, 60, size=10000) self.str_days = [] @@ -58,7 +55,6 @@ def time_convert_string_seconds(self): class ToTimedeltaErrors(object): - goal_time = 0.2 params = ['coerce', 'ignore'] param_names = ['errors'] @@ -73,8 +69,6 @@ def time_convert(self, errors): class TimedeltaOps(object): - goal_time = 0.2 - def setup(self): self.td = to_timedelta(np.arange(1000000)) self.ts = Timestamp('2000') @@ -85,8 +79,6 @@ def time_add_td_ts(self): class TimedeltaProperties(object): - goal_time = 0.2 - def setup_cache(self): td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35) return td @@ -106,8 +98,6 @@ def time_timedelta_nanoseconds(self, td): class DatetimeAccessor(object): - goal_time = 0.2 - def setup_cache(self): N = 100000 series = Series(timedelta_range('1 days', periods=N, freq='h')) @@ -127,3 +117,36 @@ def time_timedelta_microseconds(self, series): def time_timedelta_nanoseconds(self, series): series.dt.nanoseconds + + +class TimedeltaIndexing(object): + + def setup(self): + self.index = TimedeltaIndex(start='1985', periods=1000, freq='D') + self.index2 = TimedeltaIndex(start='1986', periods=1000, freq='D') + self.series = Series(range(1000), index=self.index) + self.timedelta = self.index[500] + + def time_get_loc(self): + self.index.get_loc(self.timedelta) + + def time_shape(self): + self.index.shape + + def time_shallow_copy(self): + self.index._shallow_copy() + + def time_series_loc(self): + self.series.loc[self.timedelta] + + def time_align(self): + 
DataFrame({'a': self.series, 'b': self.series[:500]}) + + def time_intersection(self): + self.index.intersection(self.index2) + + def time_union(self): + self.index.union(self.index2) + + def time_unique(self): + self.index.unique() diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 2557ba7672a0e..58cda3b871e51 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -1,5 +1,6 @@ from datetime import timedelta +import dateutil import numpy as np from pandas import to_datetime, date_range, Series, DataFrame, period_range from pandas.tseries.frequencies import infer_freq @@ -8,12 +9,9 @@ except ImportError: from pandas.tseries.converter import DatetimeConverter -from .pandas_vb_common import setup # noqa - class DatetimeIndex(object): - goal_time = 0.2 params = ['dst', 'repeated', 'tz_aware', 'tz_naive'] param_names = ['index_type'] @@ -60,9 +58,10 @@ def time_to_pydatetime(self, index_type): class TzLocalize(object): - goal_time = 0.2 + params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()] + param_names = 'tz' - def setup(self): + def setup(self, tz): dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S') self.index = date_range(start='10/29/2000', @@ -73,13 +72,12 @@ def setup(self): end='10/29/2000 3:00:00', freq='S')) - def time_infer_dst(self): - self.index.tz_localize('US/Eastern', ambiguous='infer') + def time_infer_dst(self, tz): + self.index.tz_localize(tz, ambiguous='infer') class ResetIndex(object): - goal_time = 0.2 params = [None, 'US/Eastern'] param_names = 'tz' @@ -93,7 +91,6 @@ def time_reest_datetimeindex(self, tz): class Factorize(object): - goal_time = 0.2 params = [None, 'Asia/Tokyo'] param_names = 'tz' @@ -108,7 +105,6 @@ def time_factorize(self, tz): class InferFreq(object): - goal_time = 0.2 params = [None, 'D', 'B'] param_names = ['freq'] @@ -125,8 +121,6 @@ def time_infer_freq(self, freq): class TimeDatetimeConverter(object): - goal_time = 0.2 - def setup(self): N = 100000 self.rng = date_range(start='1/1/2000', periods=N, freq='T') @@ -137,7 +131,6 @@ def time_convert(self): class Iteration(object): - goal_time = 0.2 params = [date_range, period_range] param_names = ['time_index'] @@ -158,7 +151,6 @@ def time_iter_preexit(self, time_index): class ResampleDataFrame(object): - goal_time = 0.2 params = ['max', 'mean', 'min'] param_names = ['method'] @@ -173,7 +165,6 @@ def time_method(self, method): class ResampleSeries(object): - goal_time = 0.2 params = (['period', 'datetime'], ['5min', '1D'], ['mean', 'ohlc']) param_names = ['index', 'freq', 'method'] @@ -194,8 +185,6 @@ def time_resample(self, index, freq, method): class ResampleDatetetime64(object): # GH 7754 - goal_time = 0.2 - def setup(self): rng3 = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U') @@ -207,7 +196,6 @@ def time_resample(self): class AsOf(object): - goal_time = 0.2 params = ['DataFrame', 'Series'] param_names = ['constructor'] @@ -255,7 +243,6 @@ def time_asof_nan_single(self, constructor): class SortIndex(object): - goal_time = 0.2 params = [True, False] param_names = ['monotonic'] @@ -275,8 +262,6 @@ def time_get_slice(self, monotonic): class IrregularOps(object): - goal_time = 0.2 - def setup(self): N = 10**5 idx = date_range(start='1/1/2000', periods=N, freq='s') @@ -290,8 +275,6 @@ def time_add(self): class Lookup(object): - goal_time = 0.2 - def setup(self): N = 1500000 rng = date_range(start='1/1/2000', periods=N, freq='S') @@ -305,8 +288,6 @@ def 
time_lookup_and_cleanup(self): class ToDatetimeYYYYMMDD(object): - goal_time = 0.2 - def setup(self): rng = date_range(start='1/1/2000', periods=10000, freq='D') self.stringsD = Series(rng.strftime('%Y%m%d')) @@ -317,8 +298,6 @@ def time_format_YYYYMMDD(self): class ToDatetimeISO8601(object): - goal_time = 0.2 - def setup(self): rng = date_range(start='1/1/2000', periods=20000, freq='H') self.strings = rng.strftime('%Y-%m-%d %H:%M:%S').tolist() @@ -344,8 +323,6 @@ def time_iso8601_tz_spaceformat(self): class ToDatetimeNONISO8601(object): - goal_time = 0.2 - def setup(self): N = 10000 half = int(N / 2) @@ -363,8 +340,6 @@ def time_different_offset(self): class ToDatetimeFormat(object): - goal_time = 0.2 - def setup(self): self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000) self.s2 = self.s.str.replace(':\\S+$', '') @@ -378,7 +353,6 @@ def time_no_exact(self): class ToDatetimeCache(object): - goal_time = 0.2 params = [True, False] param_names = ['cache'] @@ -407,12 +381,35 @@ def time_dup_string_tzoffset_dates(self, cache): class DatetimeAccessor(object): - def setup(self): + params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()] + param_names = 'tz' + + def setup(self, tz): N = 100000 - self.series = Series(date_range(start='1/1/2000', periods=N, freq='T')) + self.series = Series( + date_range(start='1/1/2000', periods=N, freq='T', tz=tz) + ) - def time_dt_accessor(self): + def time_dt_accessor(self, tz): self.series.dt - def time_dt_accessor_normalize(self): + def time_dt_accessor_normalize(self, tz): self.series.dt.normalize() + + def time_dt_accessor_month_name(self, tz): + self.series.dt.month_name() + + def time_dt_accessor_day_name(self, tz): + self.series.dt.day_name() + + def time_dt_accessor_time(self, tz): + self.series.dt.time + + def time_dt_accessor_date(self, tz): + self.series.dt.date + + def time_dt_accessor_year(self, tz): + self.series.dt.year + + +from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index c142a9b59fc43..64f46fe378e53 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -2,6 +2,7 @@ from pandas import Timestamp import pytz +import dateutil class TimestampConstruction(object): @@ -29,9 +30,8 @@ def time_fromtimestamp(self): class TimestampProperties(object): - goal_time = 0.2 - - _tzs = [None, pytz.timezone('Europe/Amsterdam')] + _tzs = [None, pytz.timezone('Europe/Amsterdam'), pytz.UTC, + dateutil.tz.tzutc()] _freqs = [None, 'B'] params = [_tzs, _freqs] param_names = ['tz', 'freq'] @@ -89,9 +89,8 @@ def time_microsecond(self, tz, freq): class TimestampOps(object): - goal_time = 0.2 - - params = [None, 'US/Eastern'] + params = [None, 'US/Eastern', pytz.UTC, + dateutil.tz.tzutc()] param_names = ['tz'] def setup(self, tz): @@ -106,10 +105,19 @@ def time_replace_None(self, tz): def time_to_pydatetime(self, tz): self.ts.to_pydatetime() + def time_normalize(self, tz): + self.ts.normalize() -class TimestampAcrossDst(object): - goal_time = 0.2 + def time_tz_convert(self, tz): + if self.ts.tz is not None: + self.ts.tz_convert(tz) + def time_tz_localize(self, tz): + if self.ts.tz is None: + self.ts.tz_localize(tz) + + +class TimestampAcrossDst(object): def setup(self): dt = datetime.datetime(2016, 3, 27, 1) self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5d473bfc5a38c..409b1ac8c9df3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,10 +7,10 @@ 
jobs: parameters: name: macOS vmImage: xcode9-macos10.13 -# - template: ci/azure/linux.yml -# parameters: -# name: Linux -# vmImage: ubuntu-16.04 +- template: ci/azure/linux.yml + parameters: + name: Linux + vmImage: ubuntu-16.04 # Windows Python 2.7 needs VC 9.0 installed, and not sure # how to make that a conditional task, so for now these are @@ -23,3 +23,104 @@ jobs: parameters: name: WindowsPy27 vmImage: vs2017-win2016 + +- job: 'Checks_and_doc' + pool: + vmImage: ubuntu-16.04 + timeoutInMinutes: 90 + steps: + - script: | + # XXX next command should avoid redefining the path in every step, but + # made the process crash as it couldn't find deactivate + #echo '##vso[task.prependpath]$HOME/miniconda3/bin' + echo '##vso[task.setvariable variable=CONDA_ENV]pandas-dev' + echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' + echo '##vso[task.setvariable variable=AZURE]true' + displayName: 'Setting environment variables' + + # Do not require a conda environment + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + ci/code_checks.sh patterns + displayName: 'Looking for unwanted patterns' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + sudo apt-get install -y libc6-dev-i386 + ci/incremental/install_miniconda.sh + ci/incremental/setup_conda_environment.sh + displayName: 'Set up environment' + + # Do not require pandas + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh lint + displayName: 'Linting' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh dependencies + displayName: 'Dependencies consistency' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/incremental/build.sh + displayName: 'Build' + condition: true + + # Require pandas + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh code + displayName: 'Checks on imported code' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh doctests + displayName: 'Running doctests' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh docstrings + displayName: 'Docstring validation' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + pytest --capture=no --strict scripts + displayName: 'Testing docstring validation script' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream + if git diff upstream/master --name-only | grep -q "^asv_bench/"; then + cd asv_bench + asv machine --yes + ASV_OUTPUT="$(asv dev)" + if [[ $(echo "$ASV_OUTPUT" | grep "failed") ]]; then + echo "##vso[task.logissue type=error]Benchmarks run with errors" + echo "$ASV_OUTPUT" + exit 1 + else + echo "Benchmarks run without errors" + fi + else + echo "Benchmarks did not run, no changes detected" + fi + displayName: 'Running benchmarks' + condition: true diff --git a/ci/README.txt b/ci/README.txt deleted file mode 100644 index bb71dc25d6093..0000000000000 --- a/ci/README.txt +++ /dev/null @@ -1,17 +0,0 @@ -Travis is a ci service that's well-integrated with GitHub.
-The following types of breakage should be detected -by Travis builds: - -1) Failing tests on any supported version of Python. -2) Pandas should install and the tests should run if no optional deps are installed. -That also means tests which rely on optional deps need to raise SkipTest() -if the dep is missing. -3) unicode related fails when running under exotic locales. - -We tried running the vbench suite for a while, but with varying load -on Travis machines, that wasn't useful. - -Travis currently (4/2013) has a 5-job concurrency limit. Exceeding it -basically doubles the total runtime for a commit through travis, and -since dep+pandas installation is already quite long, this should become -a hard limit on concurrent travis runs. diff --git a/ci/azure/linux.yml b/ci/azure/linux.yml new file mode 100644 index 0000000000000..fe64307e9d08f --- /dev/null +++ b/ci/azure/linux.yml @@ -0,0 +1,79 @@ +parameters: + name: '' + vmImage: '' + +jobs: +- job: ${{ parameters.name }} + pool: + vmImage: ${{ parameters.vmImage }} + strategy: + maxParallel: 11 + matrix: + py27_np_120: + ENV_FILE: ci/deps/azure-27-compat.yaml + CONDA_PY: "27" + PATTERN: "not slow and not network" + + py37_locale: + ENV_FILE: ci/deps/azure-37-locale.yaml + CONDA_PY: "37" + PATTERN: "not slow and not network" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + + py36_locale_slow: + ENV_FILE: ci/deps/azure-36-locale_slow.yaml + CONDA_PY: "36" + PATTERN: "not slow and not network" + LOCALE_OVERRIDE: "it_IT.UTF-8" + + steps: + - script: | + if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386; fi + echo "Installing Miniconda" + ci/incremental/install_miniconda.sh + export PATH=$HOME/miniconda3/bin:$PATH + echo "Setting up Conda environment" + ci/incremental/setup_conda_environment.sh + displayName: 'Before Install' + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/incremental/build.sh + displayName: 'Build' + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/run_tests.sh + displayName: 'Test' + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + - task: PublishTestResults@2 + inputs: + testResultsFiles: 'test-data-*.xml' + testRunTitle: 'Linux' + - powershell: | + $junitXml = "test-data-single.xml" + $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) + { + Write-Host "No test failures in test-data-single" + } + else + { + # note that this will produce $LASTEXITCODE=1 + Write-Error "$($matches[1]) tests failed" + } + + $junitXmlMulti = "test-data-multiple.xml" + $(Get-Content $junitXmlMulti | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) + { + Write-Host "No test failures in test-data-multi" + } + else + { + # note that this will produce $LASTEXITCODE=1 + Write-Error "$($matches[1]) tests failed" + } + displayName: Check for test failures diff --git a/ci/azure/macos.yml b/ci/azure/macos.yml index 5bf8d18d6cbb9..98409576a5a87 100644 --- a/ci/azure/macos.yml +++ b/ci/azure/macos.yml @@ -9,11 +9,10 @@ jobs: strategy: maxParallel: 11 matrix: - py35_np_110: - ENV_FILE: ci/azure-macos-35.yml + py35_np_120: + ENV_FILE: ci/deps/azure-macos-35.yaml CONDA_PY: "35" - CONDA_ENV: pandas - TEST_ARGS: "--skip-slow --skip-network" + PATTERN: "not slow and not network" steps: - script: | @@ -26,18 +25,43 @@ jobs: displayName: 'Before Install' - script: | export PATH=$HOME/miniconda3/bin:$PATH + source
activate pandas-dev ci/incremental/build.sh displayName: 'Build' - script: | export PATH=$HOME/miniconda3/bin:$PATH - ci/script_single.sh - ci/script_multi.sh - echo "[Test done]" + source activate pandas-dev + ci/run_tests.sh displayName: 'Test' - script: | export PATH=$HOME/miniconda3/bin:$PATH - source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - task: PublishTestResults@2 inputs: - testResultsFiles: '/tmp/*.xml' + testResultsFiles: 'test-data-*.xml' testRunTitle: 'MacOS-35' + - powershell: | + $junitXml = "test-data-single.xml" + $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) + { + Write-Host "No test failures in test-data-single" + } + else + { + # note that this will produce $LASTEXITCODE=1 + Write-Error "$($matches[1]) tests failed" + } + + $junitXmlMulti = "test-data-multiple.xml" + $(Get-Content $junitXmlMulti | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) + { + Write-Host "No test failures in test-data-multi" + } + else + { + # note that this will produce $LASTEXITCODE=1 + Write-Error "$($matches[1]) tests failed" + } + displayName: Check for test failures diff --git a/ci/azure/windows-py27.yml b/ci/azure/windows-py27.yml index 3e92c96263930..0d9aea816c4ad 100644 --- a/ci/azure/windows-py27.yml +++ b/ci/azure/windows-py27.yml @@ -9,10 +9,9 @@ jobs: strategy: maxParallel: 11 matrix: - py36_np14: - ENV_FILE: ci/azure-windows-27.yml + py36_np121: + ENV_FILE: ci/deps/azure-windows-27.yaml CONDA_PY: "27" - CONDA_ENV: pandas steps: - task: CondaEnvironment@1 @@ -33,13 +32,27 @@ jobs: ci\\incremental\\setup_conda_environment.cmd displayName: 'Before Install' - script: | + call activate pandas-dev ci\\incremental\\build.cmd displayName: 'Build' - script: | - call activate %CONDA_ENV% - pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict %* + call activate pandas-dev + pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 -r sxX --strict --durations=10 %* displayName: 'Test' - task: PublishTestResults@2 inputs: testResultsFiles: 'test-data.xml' testRunTitle: 'Windows 27' + - powershell: | + $junitXml = "test-data.xml" + $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) + { + Write-Host "No test failures in test-data" + } + else + { + # note that this will produce $LASTEXITCODE=1 + Write-Error "$($matches[1]) tests failed" + } + displayName: Check for test failures diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 2ab8c6f320188..b69c210ca27ba 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -10,9 +10,8 @@ jobs: maxParallel: 11 matrix: py36_np14: - ENV_FILE: ci/azure-windows-36.yml + ENV_FILE: ci/deps/azure-windows-36.yaml CONDA_PY: "36" - CONDA_ENV: pandas steps: - task: CondaEnvironment@1 @@ -24,13 +23,27 @@ jobs: ci\\incremental\\setup_conda_environment.cmd displayName: 'Before Install' - script: | + call activate pandas-dev ci\\incremental\\build.cmd displayName: 'Build' - script: | - call activate %CONDA_ENV% - pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict %* + call activate pandas-dev + pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 -r sxX --strict --durations=10 %* displayName: 'Test' - task: PublishTestResults@2 inputs: testResultsFiles: 'test-data.xml' testRunTitle: 'Windows 36' + - 
powershell: | + $junitXml = "test-data.xml" + $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) + { + Write-Host "No test failures in test-data" + } + else + { + # note that this will produce $LASTEXITCODE=1 + Write-Error "$($matches[1]) tests failed" + } + displayName: Check for test failures diff --git a/ci/build_docs.sh b/ci/build_docs.sh index f445447e3565c..f89c4369dff4a 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -5,19 +5,13 @@ if [ "${TRAVIS_OS_NAME}" != "linux" ]; then exit 0 fi -cd "$TRAVIS_BUILD_DIR" +cd "$TRAVIS_BUILD_DIR"/doc echo "inside $0" if [ "$DOC" ]; then echo "Will build docs" - source activate pandas - - mv "$TRAVIS_BUILD_DIR"/doc /tmp - mv "$TRAVIS_BUILD_DIR/LICENSE" /tmp # included in the docs. - cd /tmp/doc - echo ############################### echo # Log file for the doc build # echo ############################### @@ -29,7 +23,7 @@ if [ "$DOC" ]; then echo # Create and send docs # echo ######################## - cd /tmp/doc/build/html + cd build/html git config --global user.email "pandas-docs-bot@localhost.foo" git config --global user.name "pandas-docs-bot" diff --git a/ci/check_imports.py b/ci/check_imports.py deleted file mode 100644 index 19e48b659617f..0000000000000 --- a/ci/check_imports.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Check that certain modules are not loaded by `import pandas` -""" -import sys - -blacklist = { - 'bs4', - 'gcsfs', - 'html5lib', - 'ipython', - 'jinja2' - 'hypothesis', - 'lxml', - 'numexpr', - 'openpyxl', - 'py', - 'pytest', - 's3fs', - 'scipy', - 'tables', - 'xlrd', - 'xlsxwriter', - 'xlwt', -} - - -def main(): - import pandas # noqa - - modules = set(x.split('.')[0] for x in sys.modules) - imported = modules & blacklist - if modules & blacklist: - sys.exit("Imported {}".format(imported)) - - -if __name__ == '__main__': - main() diff --git a/ci/circle-35-ascii.yaml b/ci/circle-35-ascii.yaml deleted file mode 100644 index 281ed59e2deff..0000000000000 --- a/ci/circle-35-ascii.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: pandas -channels: - - defaults -dependencies: - - cython>=0.28.2 - - nomkl - - numpy - - python-dateutil - - python=3.5* - - pytz - # universal - - pytest - - pytest-xdist - - pip: - - hypothesis>=3.58.0 diff --git a/ci/install_circle.sh b/ci/circle/install_circle.sh similarity index 95% rename from ci/install_circle.sh rename to ci/circle/install_circle.sh index f8bcf6bcffc99..0918e8790fca2 100755 --- a/ci/install_circle.sh +++ b/ci/circle/install_circle.sh @@ -60,9 +60,9 @@ fi # create envbuild deps echo "[create env]" -time conda env create -q -n pandas --file="${ENV_FILE}" || exit 1 +time conda env create -q --file="${ENV_FILE}" || exit 1 -source activate pandas +source activate pandas-dev # remove any installed pandas package # w/o removing anything else diff --git a/ci/code_checks.sh b/ci/code_checks.sh new file mode 100755 index 0000000000000..953547f72d3e1 --- /dev/null +++ b/ci/code_checks.sh @@ -0,0 +1,227 @@ +#!/bin/bash +# +# Run checks related to code quality. +# +# This script is intended for both the CI and to check locally that code standards are +# respected. We are currently linting (PEP-8 and similar), looking for patterns of +# common mistakes (sphinx directives with missing blank lines, old style classes, +# unwanted imports...), we run doctests here (currently some files only), and we +# validate formatting error in docstrings. 
+# +# Usage: +# $ ./ci/code_checks.sh # run all checks +# $ ./ci/code_checks.sh lint # run linting only +# $ ./ci/code_checks.sh patterns # check for patterns that should not exist +# $ ./ci/code_checks.sh code # checks on imported code +# $ ./ci/code_checks.sh doctests # run doctests +# $ ./ci/code_checks.sh docstrings # validate docstring errors +# $ ./ci/code_checks.sh dependencies # check that dependencies are consistent + +[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "dependencies" ]] || \ + { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|dependencies]"; exit 9999; } + +BASE_DIR="$(dirname $0)/.." +RET=0 +CHECK=$1 + +function invgrep { + # grep with inverse exist status and formatting for azure-pipelines + # + # This function works exactly as grep, but with opposite exit status: + # - 0 (success) when no patterns are found + # - 1 (fail) when the patterns are found + # + # This is useful for the CI, as we want to fail if one of the patterns + # that we want to avoid is found by grep. + if [[ "$AZURE" == "true" ]]; then + set -o pipefail + grep -n "$@" | awk -F ":" '{print "##vso[task.logissue type=error;sourcepath=" $1 ";linenumber=" $2 ";] Found unwanted pattern: " $3}' + else + grep "$@" + fi + return $((! $?)) +} + +if [[ "$AZURE" == "true" ]]; then + FLAKE8_FORMAT="##vso[task.logissue type=error;sourcepath=%(path)s;linenumber=%(row)s;columnnumber=%(col)s;code=%(code)s;]%(text)s" +else + FLAKE8_FORMAT="default" +fi + +### LINTING ### +if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then + + # `setup.cfg` contains the list of error codes that are being ignored in flake8 + + echo "flake8 --version" + flake8 --version + + # pandas/_libs/src is C code, so no need to search there. + MSG='Linting .py code' ; echo $MSG + flake8 --format="$FLAKE8_FORMAT" . + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Linting .pyx code' ; echo $MSG + flake8 --format="$FLAKE8_FORMAT" pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403,C400,C401,C402,C403,C404,C405,C406,C407,C408,C409,C410,C411 + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Linting .pxd and .pxi.in' ; echo $MSG + flake8 --format="$FLAKE8_FORMAT" pandas/_libs --filename=*.pxi.in,*.pxd --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 + RET=$(($RET + $?)) ; echo $MSG "DONE" + + echo "flake8-rst --version" + flake8-rst --version + + MSG='Linting code-blocks in .rst documentation' ; echo $MSG + flake8-rst doc/source --filename=*.rst --format="$FLAKE8_FORMAT" + RET=$(($RET + $?)) ; echo $MSG "DONE" + + # Check that cython casting is of the form `obj` as opposed to ` obj`; + # it doesn't make a difference, but we want to be internally consistent. 
+ # Note: this grep pattern is (intended to be) equivalent to the python + # regex r'(?])> ' + MSG='Linting .pyx code for spacing conventions in casting' ; echo $MSG + invgrep -r -E --include '*.pyx' --include '*.pxi.in' '[a-zA-Z0-9*]> ' pandas/_libs + RET=$(($RET + $?)) ; echo $MSG "DONE" + + # readability/casting: Warnings about C casting instead of C++ casting + # runtime/int: Warnings about using C number types instead of C++ ones + # build/include_subdir: Warnings about prefacing included header files with directory + + # We don't lint all C files because we don't want to lint any that are built + # from Cython files nor do we want to lint C files that we didn't modify for + # this particular codebase (e.g. src/headers, src/klib, src/msgpack). However, + # we can lint all header files since they aren't "generated" like C files are. + MSG='Linting .c and .h' ; echo $MSG + cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime + RET=$(($RET + $?)) ; echo $MSG "DONE" + + echo "isort --version-number" + isort --version-number + + # Imports - Check formatting using isort see setup.cfg for settings + MSG='Check import format using isort ' ; echo $MSG + isort --recursive --check-only pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + +fi + +### PATTERNS ### +if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then + + # Check for imports from pandas.core.common instead of `import pandas.core.common as com` + MSG='Check for non-standard imports' ; echo $MSG + invgrep -R --include="*.py*" -E "from pandas.core.common import " pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for pytest warns' ; echo $MSG + invgrep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + # Check for the following code in testing: `np.testing` and `np.array_equal` + MSG='Check for invalid testing' ; echo $MSG + invgrep -r -E --include '*.py' --exclude testing.py '(numpy|np)(\.testing|\.array_equal)' pandas/tests/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + # Check for the following code in the extension array base tests: `tm.assert_frame_equal` and `tm.assert_series_equal` + MSG='Check for invalid EA testing' ; echo $MSG + invgrep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for deprecated messages without sphinx directive' ; echo $MSG + invgrep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for old-style classes' ; echo $MSG + invgrep -R --include="*.py" -E "class\s\S*[^)]:" pandas scripts + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG + invgrep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for incorrect sphinx directives' ; echo $MSG + invgrep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. 
(autosummary|contents|currentmodule|deprecated|function|image|important|include|ipython|literalinclude|math|module|note|raw|seealso|toctree|versionadded|versionchanged|warning):[^:]" ./pandas ./doc/source + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check that the deprecated `assert_raises_regex` is not used (`pytest.raises(match=pattern)` should be used instead)' ; echo $MSG + invgrep -R --exclude=*.pyc --exclude=testing.py --exclude=test_testing.py assert_raises_regex pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + +fi + +### CODE ### +if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then + + MSG='Check import. No warnings, and blacklist some optional dependencies' ; echo $MSG + python -W error -c " +import sys +import pandas + +blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2', 'hypothesis', + 'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', + 'tables', 'xlrd', 'xlsxwriter', 'xlwt'} +mods = blacklist & set(m.split('.')[0] for m in sys.modules) +if mods: + sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) + sys.exit(len(mods)) + " + RET=$(($RET + $?)) ; echo $MSG "DONE" + +fi + +### DOCTESTS ### +if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then + + MSG='Doctests frame.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/frame.py \ + -k"-axes -combine -itertuples -join -pivot_table -quantile -query -reindex -reindex_axis -round" + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests series.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/series.py \ + -k"-nonzero -reindex -searchsorted -to_dict" + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests generic.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/generic.py \ + -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard" + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests top-level reshaping functions' ; echo $MSG + pytest -q --doctest-modules \ + pandas/core/reshape/concat.py \ + pandas/core/reshape/pivot.py \ + pandas/core/reshape/reshape.py \ + pandas/core/reshape/tile.py \ + -k"-crosstab -pivot_table -cut" + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests interval classes' ; echo $MSG + pytest --doctest-modules -v \ + pandas/core/indexes/interval.py \ + pandas/core/arrays/interval.py \ + -k"-from_arrays -from_breaks -from_intervals -from_tuples -get_loc -set_closed -to_tuples -interval_range" + RET=$(($RET + $?)) ; echo $MSG "DONE" + +fi + +### DOCSTRINGS ### +if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then + + MSG='Validate docstrings (GL06, SS04, PR03, PR05, EX04)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL06,SS04,PR03,PR05,EX04 + RET=$(($RET + $?)) ; echo $MSG "DONE" + +fi + +### DEPENDENCIES ###
+if [[ -z "$CHECK" || "$CHECK" == "dependencies" ]]; then + + MSG='Check that requirements-dev.txt has been generated from environment.yml' ; echo $MSG + $BASE_DIR/scripts/generate_pip_deps_from_conda.py --compare --azure + RET=$(($RET + $?)) ; echo $MSG "DONE" + +fi + +exit $RET diff --git a/ci/circle-27-compat.yaml b/ci/deps/azure-27-compat.yaml similarity index 67% rename from ci/circle-27-compat.yaml rename to ci/deps/azure-27-compat.yaml index 84ec7e20fc8f1..f3cc615c35243 100644 --- a/ci/circle-27-compat.yaml +++ b/ci/deps/azure-27-compat.yaml @@ -1,22 +1,22 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge dependencies: - - bottleneck=1.0.0 + - bottleneck=1.2.0 -
cython=0.28.2 - jinja2=2.8 - - numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr - - numpy=1.9.3 + - numexpr=2.6.1 + - numpy=1.12.0 - openpyxl=2.5.5 - psycopg2 - - pytables=3.2.2 + - pytables=3.4.2 - python-dateutil=2.5.0 - python=2.7* - pytz=2013b - - scipy=0.14.0 + - scipy=0.18.1 - sqlalchemy=0.7.8 - - xlrd=0.9.2 + - xlrd=1.0.0 - xlsxwriter=0.5.2 - xlwt=0.7.5 # universal diff --git a/ci/circle-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml similarity index 93% rename from ci/circle-36-locale_slow.yaml rename to ci/deps/azure-36-locale_slow.yaml index 14b23dd6f3e4c..4bbc6a2c11f1e 100644 --- a/ci/circle-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge @@ -14,7 +14,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - psycopg2 - pymysql - pytables diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml new file mode 100644 index 0000000000000..11a698ce7648e --- /dev/null +++ b/ci/deps/azure-37-locale.yaml @@ -0,0 +1,35 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython>=0.28.2 + - html5lib + - ipython + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy + - openpyxl + - psycopg2 + - pymysql + - pytables + - python-dateutil + - python=3.7* + - pytz + - s3fs + - scipy + - sqlalchemy + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - pip: + - hypothesis>=3.58.0 + - moto # latest moto in conda-forge fails with 3.7, move to conda dependencies when this is fixed diff --git a/ci/azure-macos-35.yml b/ci/deps/azure-macos-35.yaml similarity index 86% rename from ci/azure-macos-35.yml rename to ci/deps/azure-macos-35.yaml index a36f748ded812..7a0c3b81ac8f9 100644 --- a/ci/azure-macos-35.yml +++ b/ci/deps/azure-macos-35.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults dependencies: @@ -8,10 +8,10 @@ dependencies: - html5lib - jinja2 - lxml - - matplotlib + - matplotlib=2.2.0 - nomkl - numexpr - - numpy=1.10.4 + - numpy=1.12.0 - openpyxl=2.5.5 - pytables - python=3.5* diff --git a/ci/azure-windows-27.yaml b/ci/deps/azure-windows-27.yaml similarity index 88% rename from ci/azure-windows-27.yaml rename to ci/deps/azure-windows-27.yaml index bcd9ddee1715e..b1533b071fa74 100644 --- a/ci/azure-windows-27.yaml +++ b/ci/deps/azure-windows-27.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge @@ -10,10 +10,10 @@ dependencies: - html5lib - jinja2=2.8 - lxml - - matplotlib + - matplotlib=2.0.1 - numexpr - numpy=1.12* - - openpyxl=2.5.5 + - openpyxl - pytables - python=2.7.* - pytz diff --git a/ci/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml similarity index 80% rename from ci/azure-windows-36.yaml rename to ci/deps/azure-windows-36.yaml index 6230e9b6a1885..817aab66c65aa 100644 --- a/ci/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -1,20 +1,21 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge dependencies: - blosc - bottleneck + - boost-cpp<1.67 - fastparquet - - feather-format - matplotlib - numexpr - numpy=1.14* - - openpyxl=2.5.5 + - openpyxl + - parquet-cpp - pyarrow - pytables - python-dateutil - - python=3.6.* + - python=3.6.6 - pytz - scipy - thrift=0.10* diff --git a/ci/circle-36-locale.yaml b/ci/deps/circle-36-locale.yaml similarity index 93% rename from ci/circle-36-locale.yaml rename to ci/deps/circle-36-locale.yaml index 
ef97b85406709..2b38465c04512 100644 --- a/ci/circle-36-locale.yaml +++ b/ci/deps/circle-36-locale.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - psycopg2 - pymysql - pytables diff --git a/ci/travis-27-locale.yaml b/ci/deps/travis-27-locale.yaml similarity index 80% rename from ci/travis-27-locale.yaml rename to ci/deps/travis-27-locale.yaml index aca65f27d4187..0846ef5e8264e 100644 --- a/ci/travis-27-locale.yaml +++ b/ci/deps/travis-27-locale.yaml @@ -1,13 +1,13 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge dependencies: - - bottleneck=1.0.0 + - bottleneck=1.2.0 - cython=0.28.2 - lxml - - matplotlib=1.4.3 - - numpy=1.9.3 + - matplotlib=2.0.0 + - numpy=1.12.0 - openpyxl=2.4.0 - python-dateutil - python-blosc @@ -16,7 +16,7 @@ dependencies: - pytz=2013b - scipy - sqlalchemy=0.8.1 - - xlrd=0.9.2 + - xlrd=1.0.0 - xlsxwriter=0.5.2 - xlwt=0.7.5 # universal diff --git a/ci/travis-27.yaml b/ci/deps/travis-27.yaml similarity index 80% rename from ci/travis-27.yaml rename to ci/deps/travis-27.yaml index 6955db363ca1f..8d14673ebde6d 100644 --- a/ci/travis-27.yaml +++ b/ci/deps/travis-27.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge @@ -7,16 +7,13 @@ dependencies: - bottleneck - cython=0.28.2 - fastparquet - - feather-format - - flake8=3.4.1 - - flake8-comprehensions - gcsfs - html5lib - ipython - jemalloc=4.5.0.post - jinja2=2.8 - lxml - - matplotlib + - matplotlib=2.2.2 - mock - nomkl - numexpr @@ -25,7 +22,7 @@ dependencies: - patsy - psycopg2 - py - - pyarrow=0.4.1 + - pyarrow=0.7.0 - PyCrypto - pymysql=0.6.3 - pytables @@ -37,8 +34,8 @@ dependencies: - s3fs - scipy - sqlalchemy=0.9.6 - - xarray=0.8.0 - - xlrd=0.9.2 + - xarray=0.9.6 + - xlrd=1.0.0 - xlsxwriter=0.5.2 - xlwt=0.7.5 # universal @@ -48,6 +45,5 @@ dependencies: - hypothesis>=3.58.0 - pip: - backports.lzma - - cpplint - pandas-gbq - pathlib diff --git a/ci/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml similarity index 83% rename from ci/travis-36-doc.yaml rename to ci/deps/travis-36-doc.yaml index 8353659e7b9a9..ed0764fab414a 100644 --- a/ci/travis-36-doc.yaml +++ b/ci/deps/travis-36-doc.yaml @@ -1,18 +1,17 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge - - r dependencies: - beautifulsoup4 - bottleneck - cython>=0.28.2 - fastparquet - - feather-format + - gitpython - html5lib - hypothesis>=3.58.0 - ipykernel - - ipython==6.5.0 + - ipython - ipywidgets - lxml - matplotlib @@ -22,22 +21,20 @@ dependencies: - notebook - numexpr - numpy=1.13* - - openpyxl=2.5.5 + - openpyxl - pandoc + - pyarrow - pyqt - pytables - python-dateutil - python-snappy - python=3.6* - pytz - - r - - rpy2 - scipy - seaborn - sphinx - sqlalchemy - statsmodels - - tzlocal - xarray - xlrd - xlsxwriter diff --git a/ci/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml similarity index 92% rename from ci/travis-36-slow.yaml rename to ci/deps/travis-36-slow.yaml index 1a7bc53e1b74b..a6ffdb95e5e7c 100644 --- a/ci/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge @@ -10,7 +10,7 @@ dependencies: - matplotlib - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - patsy - psycopg2 - pymysql diff --git a/ci/travis-36.yaml b/ci/deps/travis-36.yaml similarity index 84% rename from ci/travis-36.yaml rename to ci/deps/travis-36.yaml index 3c9daa5f8b73c..bfd69652730ed 100644 --- 
a/ci/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge @@ -7,20 +7,16 @@ dependencies: - cython>=0.28.2 - dask - fastparquet - - feather-format - gcsfs - geopandas - html5lib - - ipython - - jinja2 - - lxml - matplotlib - nomkl - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - psycopg2 - - pyarrow + - pyarrow=0.9.0 - pymysql - pytables - python-snappy @@ -29,7 +25,6 @@ dependencies: - s3fs - scikit-learn - scipy - - seaborn - sqlalchemy - statsmodels - xarray diff --git a/ci/travis-37-numpydev.yaml b/ci/deps/travis-37-numpydev.yaml similarity index 95% rename from ci/travis-37-numpydev.yaml rename to ci/deps/travis-37-numpydev.yaml index 82c75b7c91b1f..99ae228f25de3 100644 --- a/ci/travis-37-numpydev.yaml +++ b/ci/deps/travis-37-numpydev.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults dependencies: diff --git a/ci/travis-37.yaml b/ci/deps/travis-37.yaml similarity index 87% rename from ci/travis-37.yaml rename to ci/deps/travis-37.yaml index 4f2138d8555e3..a297786f6b14d 100644 --- a/ci/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge @@ -9,6 +9,7 @@ dependencies: - numpy - python-dateutil - nomkl + - pyarrow - pytz - pytest - pytest-xdist diff --git a/ci/doctests.sh b/ci/doctests.sh deleted file mode 100755 index 16b3430f1e431..0000000000000 --- a/ci/doctests.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -echo "inside $0" - - -source activate pandas -cd "$TRAVIS_BUILD_DIR" - -RET=0 - -if [ "$DOCTEST" ]; then - - echo "Running doctests" - - # running all doctests is not yet working - # pytest --doctest-modules --ignore=pandas/tests -v pandas - - # if [ $? -ne "0" ]; then - # RET=1 - # fi - - # DataFrame / Series docstrings - pytest --doctest-modules -v pandas/core/frame.py \ - -k"-axes -combine -itertuples -join -nlargest -nsmallest -nunique -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack -to_dict -to_stata" - - if [ $? -ne "0" ]; then - RET=1 - fi - - pytest --doctest-modules -v pandas/core/series.py \ - -k"-nonzero -reindex -searchsorted -to_dict" - - if [ $? -ne "0" ]; then - RET=1 - fi - - pytest --doctest-modules -v pandas/core/generic.py \ - -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -resample -to_json -transpose -values -xs" - - if [ $? -ne "0" ]; then - RET=1 - fi - - # top-level reshaping functions - pytest --doctest-modules -v \ - pandas/core/reshape/concat.py \ - pandas/core/reshape/pivot.py \ - pandas/core/reshape/reshape.py \ - pandas/core/reshape/tile.py \ - -k"-crosstab -pivot_table -cut" - - if [ $? 
-ne "0" ]; then - RET=1 - fi - -else - echo "NOT running doctests" -fi - -exit $RET diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml deleted file mode 100644 index f3323face4144..0000000000000 --- a/ci/environment-dev.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - Cython>=0.28.2 - - NumPy - - flake8 - - flake8-comprehensions - - hypothesis>=3.58.0 - - moto - - pytest>=3.6 - - python-dateutil>=2.5.0 - - python=3 - - pytz - - setuptools>=24.2.0 - - sphinx - - sphinxcontrib-spelling diff --git a/ci/incremental/build.cmd b/ci/incremental/build.cmd index d2fd06d7d9e50..2cce38c03f406 100644 --- a/ci/incremental/build.cmd +++ b/ci/incremental/build.cmd @@ -1,5 +1,4 @@ @rem https://github.com/numba/numba/blob/master/buildscripts/incremental/build.cmd -call activate %CONDA_ENV% @rem Build numba extensions without silencing compile errors python setup.py build_ext -q --inplace diff --git a/ci/incremental/build.sh b/ci/incremental/build.sh index 8f2301a3b7ef5..05648037935a3 100755 --- a/ci/incremental/build.sh +++ b/ci/incremental/build.sh @@ -1,7 +1,5 @@ #!/bin/bash -source activate $CONDA_ENV - # Make sure any error below is reported as such set -v -e diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd index b4446c49fabd3..c104d78591384 100644 --- a/ci/incremental/setup_conda_environment.cmd +++ b/ci/incremental/setup_conda_environment.cmd @@ -11,11 +11,11 @@ call deactivate @rem Display root environment (for debugging) conda list @rem Clean up any left-over from a previous build -conda remove --all -q -y -n %CONDA_ENV% +conda remove --all -q -y -n pandas-dev @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite -conda env create -n %CONDA_ENV% --file=ci\azure-windows-%CONDA_PY%.yaml +conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml -call activate %CONDA_ENV% +call activate pandas-dev conda list if %errorlevel% neq 0 exit /b %errorlevel% diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh index c716a39138644..f174c17a614d8 100755 --- a/ci/incremental/setup_conda_environment.sh +++ b/ci/incremental/setup_conda_environment.sh @@ -5,6 +5,7 @@ set -v -e CONDA_INSTALL="conda install -q -y" PIP_INSTALL="pip install -q" + # Deactivate any environment source deactivate # Display root environment (for debugging) @@ -12,28 +13,31 @@ conda list # Clean up any left-over from a previous build # (note workaround for https://github.com/conda/conda/issues/2679: # `conda env remove` issue) -conda remove --all -q -y -n $CONDA_ENV +conda remove --all -q -y -n pandas-dev echo echo "[create env]" -time conda env create -q -n "${CONDA_ENV}" --file="${ENV_FILE}" || exit 1 +time conda env create -q --file="${ENV_FILE}" || exit 1 -# Activate first set +v -source activate $CONDA_ENV +source activate pandas-dev set -v # remove any installed pandas package # w/o removing anything else echo echo "[removing installed pandas]" -conda remove pandas -y --force -pip uninstall -y pandas +conda remove pandas -y --force || true +pip uninstall -y pandas || true echo echo "[no installed pandas]" conda list pandas +if [ -n "$LOCALE_OVERRIDE" ]; then + sudo locale-gen "$LOCALE_OVERRIDE" +fi + # # Install the compiler toolchain # if [[ $(uname) == Linux ]]; then # if [[ "$CONDA_SUBDIR" == "linux-32" || "$BITS32" == "yes" ]] ; then diff --git a/ci/install_travis.sh 
b/ci/install_travis.sh index fd4a36f86db6c..d1a940f119228 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -80,9 +80,9 @@ echo echo "[create env]" # create our environment -time conda env create -q -n pandas --file="${ENV_FILE}" || exit 1 +time conda env create -q --file="${ENV_FILE}" || exit 1 -source activate pandas +source activate pandas-dev # remove any installed pandas package # w/o removing anything else diff --git a/ci/lint.sh b/ci/lint.sh deleted file mode 100755 index 533e1d18d8e0e..0000000000000 --- a/ci/lint.sh +++ /dev/null @@ -1,196 +0,0 @@ -#!/bin/bash - -echo "inside $0" - -source activate pandas - -RET=0 - -if [ "$LINT" ]; then - - # We're ignoring the following codes across the board - #E402, # module level import not at top of file - #E731, # do not assign a lambda expression, use a def - #E741, # do not use variables named 'l', 'O', or 'I' - #W503, # line break before binary operator - #C406, # Unnecessary (list/tuple) literal - rewrite as a dict literal. - #C408, # Unnecessary (dict/list/tuple) call - rewrite as a literal. - #C409, # Unnecessary (list/tuple) passed to tuple() - (remove the outer call to tuple()/rewrite as a tuple literal). - #C410 # Unnecessary (list/tuple) passed to list() - (remove the outer call to list()/rewrite as a list literal). - - # pandas/_libs/src is C code, so no need to search there. - echo "Linting *.py" - flake8 pandas --filename=*.py --exclude pandas/_libs/src --ignore=C406,C408,C409,E402,E731,E741,W503 - if [ $? -ne "0" ]; then - RET=1 - fi - - flake8 scripts/tests --filename=*.py - if [ $? -ne "0" ]; then - RET=1 - fi - echo "Linting *.py DONE" - - echo "Linting setup.py" - flake8 setup.py --ignore=E402,E731,E741,W503 - if [ $? -ne "0" ]; then - RET=1 - fi - echo "Linting setup.py DONE" - - echo "Linting asv_bench/benchmarks/" - flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/*.py --ignore=F811,C406,C408,C409,C410 - if [ $? -ne "0" ]; then - RET=1 - fi - echo "Linting asv_bench/benchmarks/*.py DONE" - - echo "Linting scripts/*.py" - flake8 scripts --filename=*.py --ignore=C408,E402,E731,E741,W503 - if [ $? -ne "0" ]; then - RET=1 - fi - echo "Linting scripts/*.py DONE" - - echo "Linting doc scripts" - flake8 doc/make.py doc/source/conf.py --ignore=E402,E731,E741,W503 - if [ $? -ne "0" ]; then - RET=1 - fi - echo "Linting doc scripts DONE" - - echo "Linting *.pyx" - flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403,C400,C401,C402,C403,C404,C405,C406,C407,C408,C409,C410,C411 - if [ $? -ne "0" ]; then - RET=1 - fi - echo "Linting *.pyx DONE" - - echo "Linting *.pxi.in" - for path in 'src' - do - echo "linting -> pandas/$path" - flake8 pandas/$path --filename=*.pxi.in --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 - if [ $? -ne "0" ]; then - RET=1 - fi - done - echo "Linting *.pxi.in DONE" - - echo "Linting *.pxd" - for path in '_libs' - do - echo "linting -> pandas/$path" - flake8 pandas/$path --filename=*.pxd --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 - if [ $? 
-ne "0" ]; then - RET=1 - fi - done - echo "Linting *.pxd DONE" - - # readability/casting: Warnings about C casting instead of C++ casting - # runtime/int: Warnings about using C number types instead of C++ ones - # build/include_subdir: Warnings about prefacing included header files with directory - - # We don't lint all C files because we don't want to lint any that are built - # from Cython files nor do we want to lint C files that we didn't modify for - # this particular codebase (e.g. src/headers, src/klib, src/msgpack). However, - # we can lint all header files since they aren't "generated" like C files are. - echo "Linting *.c and *.h" - for path in '*.h' 'parser' 'ujson' - do - echo "linting -> pandas/_libs/src/$path" - cpplint --quiet --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/_libs/src/$path - if [ $? -ne "0" ]; then - RET=1 - fi - done - echo "linting -> pandas/_libs/tslibs/src/datetime" - cpplint --quiet --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/_libs/tslibs/src/datetime - if [ $? -ne "0" ]; then - RET=1 - fi - echo "Linting *.c and *.h DONE" - - echo "Check for invalid testing" - - # Check for the following code in testing: - # - # np.testing - # np.array_equal - grep -r -E --include '*.py' --exclude testing.py '(numpy|np)(\.testing|\.array_equal)' pandas/tests/ - - if [ $? = "0" ]; then - RET=1 - fi - - # Check for pytest.warns - grep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ - - if [ $? = "0" ]; then - RET=1 - fi - - # Check for the following code in the extension array base tests - # tm.assert_frame_equal - # tm.assert_series_equal - grep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base - - if [ $? = "0" ]; then - RET=1 - fi - - echo "Check for invalid testing DONE" - - # Check for imports from pandas.core.common instead - # of `import pandas.core.common as com` - echo "Check for non-standard imports" - grep -R --include="*.py*" -E "from pandas.core.common import " pandas - if [ $? = "0" ]; then - RET=1 - fi - echo "Check for non-standard imports DONE" - - echo "Check for incorrect sphinx directives" - SPHINX_DIRECTIVES=$(echo \ - "autosummary|contents|currentmodule|deprecated|function|image|"\ - "important|include|ipython|literalinclude|math|module|note|raw|"\ - "seealso|toctree|versionadded|versionchanged|warning" | tr -d "[:space:]") - for path in './pandas' './doc/source' - do - grep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. ($SPHINX_DIRECTIVES):[^:]" $path - if [ $? = "0" ]; then - RET=1 - fi - done - echo "Check for incorrect sphinx directives DONE" - - echo "Check for deprecated messages without sphinx directive" - grep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas - - if [ $? = "0" ]; then - RET=1 - fi - echo "Check for deprecated messages without sphinx directive DONE" - - echo "Check for old-style classes" - grep -R --include="*.py" -E "class\s\S*[^)]:" pandas scripts - - if [ $? = "0" ]; then - RET=1 - fi - echo "Check for old-style classes DONE" - - echo "Check for backticks incorrectly rendering because of missing spaces" - grep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ - - if [ $? 
= "0" ]; then - RET=1 - fi - echo "Check for backticks incorrectly rendering because of missing spaces DONE" - -else - echo "NOT Linting" -fi - -exit $RET diff --git a/ci/print_versions.py b/ci/print_versions.py deleted file mode 100755 index 8be795174d76d..0000000000000 --- a/ci/print_versions.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python - - -def show_versions(as_json=False): - import imp - import os - fn = __file__ - this_dir = os.path.dirname(fn) - pandas_dir = os.path.abspath(os.path.join(this_dir, "..")) - sv_path = os.path.join(pandas_dir, 'pandas', 'util') - mod = imp.load_module( - 'pvmod', *imp.find_module('print_versions', [sv_path])) - return mod.show_versions(as_json) - - -if __name__ == '__main__': - # optparse is 2.6-safe - from optparse import OptionParser - parser = OptionParser() - parser.add_option("-j", "--json", metavar="FILE", nargs=1, - help="Save output as JSON into file, pass in '-' to output to stdout") - - (options, args) = parser.parse_args() - - if options.json == "-": - options.json = True - - show_versions(as_json=options.json) diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt deleted file mode 100644 index 376fdb1e14e3a..0000000000000 --- a/ci/requirements-optional-conda.txt +++ /dev/null @@ -1,29 +0,0 @@ -beautifulsoup4>=4.2.1 -blosc -bottleneck -fastparquet -feather-format -gcsfs -html5lib -ipython>=5.6.0 -ipykernel -jinja2 -lxml -matplotlib -nbsphinx -numexpr -openpyxl=2.5.5 -pyarrow -pymysql -pytables -pytest-cov -pytest-xdist -s3fs -scipy -seaborn -sqlalchemy -statsmodels -xarray -xlrd -xlsxwriter -xlwt diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt deleted file mode 100644 index 09ce8e59a3b46..0000000000000 --- a/ci/requirements-optional-pip.txt +++ /dev/null @@ -1,31 +0,0 @@ -# This file was autogenerated by scripts/convert_deps.py -# Do not modify directly -beautifulsoup4>=4.2.1 -blosc -bottleneck -fastparquet -feather-format -gcsfs -html5lib -ipython>=5.6.0 -ipykernel -jinja2 -lxml -matplotlib -nbsphinx -numexpr -openpyxl==2.5.5 -pyarrow -pymysql -tables -pytest-cov -pytest-xdist -s3fs -scipy -seaborn -sqlalchemy -statsmodels -xarray -xlrd -xlsxwriter -xlwt \ No newline at end of file diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt deleted file mode 100644 index 68fffe5d0df09..0000000000000 --- a/ci/requirements_dev.txt +++ /dev/null @@ -1,14 +0,0 @@ -# This file was autogenerated by scripts/convert_deps.py -# Do not modify directly -Cython>=0.28.2 -NumPy -flake8 -flake8-comprehensions -hypothesis>=3.58.0 -moto -pytest>=3.6 -python-dateutil>=2.5.0 -pytz -setuptools>=24.2.0 -sphinx -sphinxcontrib-spelling \ No newline at end of file diff --git a/ci/run_circle.sh b/ci/run_circle.sh deleted file mode 100755 index fc2a8b849a354..0000000000000 --- a/ci/run_circle.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -echo "[running tests]" -export PATH="$MINICONDA_DIR/bin:$PATH" - -source activate pandas - -echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" -pytest --strict --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/run_tests.sh b/ci/run_tests.sh new file mode 100755 index 0000000000000..ee46da9f52eab --- /dev/null +++ b/ci/run_tests.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +set -e + +if [ "$DOC" ]; then + echo "We are not running pytest as this is a doc-build" + exit 0 +fi + +# Workaround for pytest-xdist flaky collection order +# 
https://github.com/pytest-dev/pytest/issues/920 +# https://github.com/pytest-dev/pytest/issues/1075 +export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') + +if [ -n "$LOCALE_OVERRIDE" ]; then + export LC_ALL="$LOCALE_OVERRIDE" + export LANG="$LOCALE_OVERRIDE" + PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'` + if [[ "$LOCALE_OVERIDE" != "$PANDAS_LOCALE" ]]; then + echo "pandas could not detect the locale. System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE" + # TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed + # exit 1 + fi +fi +if [[ "not network" == *"$PATTERN"* ]]; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi + + +if [ -n "$PATTERN" ]; then + PATTERN=" and $PATTERN" +fi + +for TYPE in single multiple +do + if [ "$COVERAGE" ]; then + COVERAGE_FNAME="/tmp/coc-$TYPE.xml" + COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME" + fi + + TYPE_PATTERN=$TYPE + NUM_JOBS=1 + if [[ "$TYPE_PATTERN" == "multiple" ]]; then + TYPE_PATTERN="not single" + NUM_JOBS=2 + fi + + PYTEST_CMD="pytest -m \"$TYPE_PATTERN$PATTERN\" -n $NUM_JOBS -s --strict --durations=10 --junitxml=test-data-$TYPE.xml $TEST_ARGS $COVERAGE pandas" + echo $PYTEST_CMD + # if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code + sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret" + + if [[ "$COVERAGE" && $? == 0 ]]; then + echo "uploading coverage for $TYPE tests" + echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME" + bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME + fi +done diff --git a/ci/script_multi.sh b/ci/script_multi.sh deleted file mode 100755 index dcc5a14d7b3b4..0000000000000 --- a/ci/script_multi.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -e - -echo "[script multi]" - -source activate pandas - -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE"; - echo "Setting LC_ALL to $LOCALE_OVERRIDE" - - pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' - python -c "$pycmd" -fi - -# Enforce absent network during testing by faking a proxy -if echo "$TEST_ARGS" | grep -e --skip-network -q; then - export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; -fi - -# Workaround for pytest-xdist flaky collection order -# https://github.com/pytest-dev/pytest/issues/920 -# https://github.com/pytest-dev/pytest/issues/1075 -export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -echo PYTHONHASHSEED=$PYTHONHASHSEED - -if [ "$DOC" ]; then - echo "We are not running pytest as this is a doc-build" - -elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas - -elif [ "$SLOW" ]; then - TEST_ARGS="--only-slow --skip-network" - echo pytest -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas - pytest -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas - -else - echo pytest -n 2 -m "not single" 
--junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas - pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas # TODO: doctest - -fi - -RET="$?" - -exit "$RET" diff --git a/ci/script_single.sh b/ci/script_single.sh deleted file mode 100755 index 09e7446a2d876..0000000000000 --- a/ci/script_single.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -echo "[script_single]" - -source activate pandas - -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE"; - echo "Setting LC_ALL to $LOCALE_OVERRIDE" - - pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' - python -c "$pycmd" -fi - -if [ "$SLOW" ]; then - TEST_ARGS="--only-slow --skip-network" -fi - -# Enforce absent network during testing by faking a proxy -if echo "$TEST_ARGS" | grep -e --skip-network -q; then - export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; -fi - -if [ "$DOC" ]; then - echo "We are not running pytest as this is a doc-build" - -elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - - echo pytest -s --strict scripts - pytest -s --strict scripts -else - echo pytest -m "single" --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas - pytest -m "single" --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest - -fi - -RET="$?" - -exit "$RET" diff --git a/ci/show_circle.sh b/ci/show_circle.sh deleted file mode 100755 index bfaa65c1d84f2..0000000000000 --- a/ci/show_circle.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -echo "[installed versions]" - -export PATH="$MINICONDA_DIR/bin:$PATH" -source activate pandas - -python -c "import pandas; pandas.show_versions();" diff --git a/ci/upload_coverage.sh b/ci/upload_coverage.sh deleted file mode 100755 index a7ef2fa908079..0000000000000 --- a/ci/upload_coverage.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -if [ -z "$COVERAGE" ]; then - echo "coverage is not selected for this build" - exit 0 -fi - -source activate pandas - -echo "uploading coverage" -bash <(curl -s https://codecov.io/bash) -Z -c -F single -f /tmp/cov-single.xml -bash <(curl -s https://codecov.io/bash) -Z -c -F multiple -f /tmp/cov-multiple.xml diff --git a/doc/README.rst b/doc/README.rst index 12950d323f5d3..a11ed8d9d03e3 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -1,173 +1 @@ -.. _contributing.docs: - -Contributing to the documentation -================================= - -Whether you are someone who loves writing, teaching, or development, -contributing to the documentation is a huge value. If you don't see yourself -as a developer type, please don't stress and know that we want you to -contribute. You don't even have to be an expert on *pandas* to do so! -Something as simple as rewriting small passages for clarity -as you reference the docs is a simple but effective way to contribute. The -next person to read that passage will be in your debt! - -Actually, there are sections of the docs that are worse off by being written -by experts. If something in the docs doesn't make sense to you, updating the -relevant section after you figure it out is a simple way to ensure it will -help the next person. - -.. 
contents:: Table of contents: - :local: - - -About the pandas documentation ------------------------------- - -The documentation is written in **reStructuredText**, which is almost like writing -in plain English, and built using `Sphinx `__. The -Sphinx Documentation has an excellent `introduction to reST -`__. Review the Sphinx docs to perform more -complex changes to the documentation as well. - -Some other important things to know about the docs: - -- The pandas documentation consists of two parts: the docstrings in the code - itself and the docs in this folder ``pandas/doc/``. - - The docstrings provide a clear explanation of the usage of the individual - functions, while the documentation in this folder consists of tutorial-like - overviews per topic together with some other information (what's new, - installation, etc). - -- The docstrings follow the **Numpy Docstring Standard** which is used widely - in the Scientific Python community. This standard specifies the format of - the different sections of the docstring. See `this document - `_ - for a detailed explanation, or look at some of the existing functions to - extend it in a similar manner. - -- The tutorials make heavy use of the `ipython directive - `_ sphinx extension. - This directive lets you put code in the documentation which will be run - during the doc build. For example: - - :: - - .. ipython:: python - - x = 2 - x**3 - - will be rendered as - - :: - - In [1]: x = 2 - - In [2]: x**3 - Out[2]: 8 - - This means that almost all code examples in the docs are always run (and the - output saved) during the doc build. This way, they will always be up to date, - but it makes the doc building a bit more complex. - - -How to build the pandas documentation -------------------------------------- - -Requirements -^^^^^^^^^^^^ - -To build the pandas docs there are some extra requirements: you will need to -have ``sphinx`` and ``ipython`` installed. `numpydoc -`_ is used to parse the docstrings that -follow the Numpy Docstring Standard (see above), but you don't need to install -this because a local copy of ``numpydoc`` is included in the pandas source -code. `nbsphinx `_ is used to convert -Jupyter notebooks. You will need to install it if you intend to modify any of -the notebooks included in the documentation. - -Furthermore, it is recommended to have all `optional dependencies -`_ -installed. This is not needed, but be aware that you will see some error -messages. Because all the code in the documentation is executed during the doc -build, the examples using this optional dependencies will generate errors. -Run ``pd.show_versions()`` to get an overview of the installed version of all -dependencies. - -.. warning:: - - Sphinx version >= 1.2.2 or the older 1.1.3 is required. - -Building pandas -^^^^^^^^^^^^^^^ - -For a step-by-step overview on how to set up your environment, to work with -the pandas code and git, see `the developer pages -`_. -When you start to work on some docs, be sure to update your code to the latest -development version ('master'):: - - git fetch upstream - git rebase upstream/master - -Often it will be necessary to rebuild the C extension after updating:: - - python setup.py build_ext --inplace - -Building the documentation -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -So how do you build the docs? Navigate to your local folder -``pandas/doc/`` directory in the console and run:: - - python make.py html - -And then you can find the html output in the folder ``pandas/doc/build/html/``. 
- -The first time it will take quite a while, because it has to run all the code -examples in the documentation and build all generated docstring pages. -In subsequent evocations, sphinx will try to only build the pages that have -been modified. - -If you want to do a full clean build, do:: - - python make.py clean - python make.py build - - -Starting with 0.13.1 you can tell ``make.py`` to compile only a single section -of the docs, greatly reducing the turn-around time for checking your changes. -You will be prompted to delete `.rst` files that aren't required, since the -last committed version can always be restored from git. - -:: - - #omit autosummary and API section - python make.py clean - python make.py --no-api - - # compile the docs with only a single - # section, that which is in indexing.rst - python make.py clean - python make.py --single indexing - -For comparison, a full doc build may take 10 minutes. a ``-no-api`` build -may take 3 minutes and a single section may take 15 seconds. - -Where to start? ---------------- - -There are a number of issues listed under `Docs -`_ -and `good first issue -`_ -where you could start out. - -Or maybe you have an idea of your own, by using pandas, looking for something -in the documentation and thinking 'this can be improved', let's do something -about that! - -Feel free to ask questions on `mailing list -`_ or submit an -issue on Github. +See `contributing.rst `_ in this repo. diff --git a/doc/make.py b/doc/make.py index cab5fa0ed4c52..0a3a7483fcc91 100755 --- a/doc/make.py +++ b/doc/make.py @@ -126,7 +126,12 @@ def _process_single_doc(self, single_doc): self.single_doc = 'api' elif os.path.exists(os.path.join(SOURCE_PATH, single_doc)): self.single_doc_type = 'rst' - self.single_doc = os.path.splitext(os.path.basename(single_doc))[0] + + if 'whatsnew' in single_doc: + basename = single_doc + else: + basename = os.path.basename(single_doc) + self.single_doc = os.path.splitext(basename)[0] elif os.path.exists( os.path.join(SOURCE_PATH, '{}.rst'.format(single_doc))): self.single_doc_type = 'rst' diff --git a/doc/source/10min.rst b/doc/source/10min.rst index fbbe94a72c71e..e04a8253e0bef 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -5,19 +5,19 @@ .. ipython:: python :suppress: + import os import numpy as np + import pandas as pd - import os + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - import matplotlib - # matplotlib.style.use('default') pd.options.display.max_rows = 15 - #### portions of this were borrowed from the - #### Pandas cheatsheet - #### created during the PyData Workshop-Sprint 2012 - #### Hannah Chen, Henry Chow, Eric Cox, Robert Mauriello + # portions of this were borrowed from the + # Pandas cheatsheet + # created during the PyData Workshop-Sprint 2012 + # Hannah Chen, Henry Chow, Eric Cox, Robert Mauriello ******************** @@ -31,9 +31,8 @@ Customarily, we import as follows: .. ipython:: python - import pandas as pd import numpy as np - import matplotlib.pyplot as plt + import pandas as pd Object Creation --------------- @@ -45,7 +44,7 @@ a default integer index: .. 
ipython:: python - s = pd.Series([1,3,5,np.nan,6,8]) + s = pd.Series([1, 3, 5, np.nan, 6, 8]) s Creating a :class:`DataFrame` by passing a NumPy array, with a datetime index @@ -55,19 +54,19 @@ and labeled columns: dates = pd.date_range('20130101', periods=6) dates - df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) df Creating a ``DataFrame`` by passing a dict of objects that can be converted to series-like. .. ipython:: python - df2 = pd.DataFrame({ 'A' : 1., - 'B' : pd.Timestamp('20130102'), - 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), - 'D' : np.array([3] * 4,dtype='int32'), - 'E' : pd.Categorical(["test","train","test","train"]), - 'F' : 'foo' }) + df2 = pd.DataFrame({'A': 1., + 'B': pd.Timestamp('20130102'), + 'C': pd.Series(1, index=list(range(4)), dtype='float32'), + 'D': np.array([3] * 4, dtype='int32'), + 'E': pd.Categorical(["test", "train", "test", "train"]), + 'F': 'foo'}) df2 The columns of the resulting ``DataFrame`` have different @@ -114,13 +113,40 @@ Here is how to view the top and bottom rows of the frame: df.head() df.tail(3) -Display the index, columns, and the underlying NumPy data: +Display the index, columns: .. ipython:: python df.index df.columns - df.values + +:meth:`DataFrame.to_numpy` gives a NumPy representation of the underlying data. +Note that his can be an expensive operation when your :class:`DataFrame` has +columns with different data types, which comes down to a fundamental difference +between pandas and NumPy: **NumPy arrays have one dtype for the entire array, +while pandas DataFrames have one dtype per column**. When you call +:meth:`DataFrame.to_numpy`, pandas will find the NumPy dtype that can hold *all* +of the dtypes in the DataFrame. This may end up being ``object``, which requires +casting every value to a Python object. + +For ``df``, our :class:`DataFrame` of all floating-point values, +:meth:`DataFrame.to_numpy` is fast and doesn't require copying data. + +.. ipython:: python + + df.to_numpy() + +For ``df2``, the :class:`DataFrame` with multiple dtypes, +:meth:`DataFrame.to_numpy` is relatively expensive. + +.. ipython:: python + + df2.to_numpy() + +.. note:: + + :meth:`DataFrame.to_numpy` does *not* include the index or column + labels in the output. :func:`~DataFrame.describe` shows a quick statistic summary of your data: @@ -190,31 +216,31 @@ Selecting on a multi-axis by label: .. ipython:: python - df.loc[:,['A','B']] + df.loc[:, ['A', 'B']] Showing label slicing, both endpoints are *included*: .. ipython:: python - df.loc['20130102':'20130104',['A','B']] + df.loc['20130102':'20130104', ['A', 'B']] Reduction in the dimensions of the returned object: .. ipython:: python - df.loc['20130102',['A','B']] + df.loc['20130102', ['A', 'B']] For getting a scalar value: .. ipython:: python - df.loc[dates[0],'A'] + df.loc[dates[0], 'A'] For getting fast access to a scalar (equivalent to the prior method): .. ipython:: python - df.at[dates[0],'A'] + df.at[dates[0], 'A'] Selection by Position ~~~~~~~~~~~~~~~~~~~~~ @@ -231,37 +257,37 @@ By integer slices, acting similar to numpy/python: .. ipython:: python - df.iloc[3:5,0:2] + df.iloc[3:5, 0:2] By lists of integer position locations, similar to the numpy/python style: .. ipython:: python - df.iloc[[1,2,4],[0,2]] + df.iloc[[1, 2, 4], [0, 2]] For slicing rows explicitly: .. ipython:: python - df.iloc[1:3,:] + df.iloc[1:3, :] For slicing columns explicitly: .. 
ipython:: python - df.iloc[:,1:3] + df.iloc[:, 1:3] For getting a value explicitly: .. ipython:: python - df.iloc[1,1] + df.iloc[1, 1] For getting fast access to a scalar (equivalent to the prior method): .. ipython:: python - df.iat[1,1] + df.iat[1, 1] Boolean Indexing ~~~~~~~~~~~~~~~~ @@ -283,9 +309,9 @@ Using the :func:`~Series.isin` method for filtering: .. ipython:: python df2 = df.copy() - df2['E'] = ['one', 'one','two','three','four','three'] + df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three'] df2 - df2[df2['E'].isin(['two','four'])] + df2[df2['E'].isin(['two', 'four'])] Setting ~~~~~~~ @@ -295,7 +321,7 @@ by the indexes. .. ipython:: python - s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6)) + s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6)) s1 df['F'] = s1 @@ -303,19 +329,19 @@ Setting values by label: .. ipython:: python - df.at[dates[0],'A'] = 0 + df.at[dates[0], 'A'] = 0 Setting values by position: .. ipython:: python - df.iat[0,1] = 0 + df.iat[0, 1] = 0 Setting by assigning with a NumPy array: .. ipython:: python - df.loc[:,'D'] = np.array([5] * len(df)) + df.loc[:, 'D'] = np.array([5] * len(df)) The result of the prior setting operations. @@ -345,7 +371,7 @@ returns a copy of the data. .. ipython:: python df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) - df1.loc[dates[0]:dates[1],'E'] = 1 + df1.loc[dates[0]:dates[1], 'E'] = 1 df1 To drop any rows that have missing data. @@ -394,7 +420,7 @@ In addition, pandas automatically broadcasts along the specified dimension. .. ipython:: python - s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2) + s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2) s df.sub(s, axis='index') @@ -492,7 +518,7 @@ section. .. ipython:: python - df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) df s = df.iloc[3] df.append(s, ignore_index=True) @@ -512,12 +538,12 @@ See the :ref:`Grouping section `. .. ipython:: python - df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C' : np.random.randn(8), - 'D' : np.random.randn(8)}) + df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) df Grouping and then applying the :meth:`~DataFrame.sum` function to the resulting @@ -532,7 +558,7 @@ apply the ``sum`` function. .. ipython:: python - df.groupby(['A','B']).sum() + df.groupby(['A', 'B']).sum() Reshaping --------- @@ -578,11 +604,11 @@ See the section on :ref:`Pivot Tables `. .. ipython:: python - df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, - 'B' : ['A', 'B', 'C'] * 4, - 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, - 'D' : np.random.randn(12), - 'E' : np.random.randn(12)}) + df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3, + 'B': ['A', 'B', 'C'] * 4, + 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, + 'D': np.random.randn(12), + 'E': np.random.randn(12)}) df We can produce pivot tables from this data very easily: @@ -653,7 +679,8 @@ pandas can include categorical data in a ``DataFrame``. For full docs, see the .. 
ipython:: python - df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], + "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) Convert the raw grades to a categorical data type. @@ -674,7 +701,8 @@ Reorder the categories and simultaneously add the missing categories (methods un .. ipython:: python - df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) + df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", + "good", "very good"]) df["grade"] Sorting is per order in the categories, not lexical order. @@ -703,7 +731,8 @@ See the :ref:`Plotting ` docs. .. ipython:: python - ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), + index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -718,8 +747,10 @@ of the columns with labels: columns=['A', 'B', 'C', 'D']) df = df.cumsum() + plt.figure() + df.plot() @savefig frame_plot_basic.png - plt.figure(); df.plot(); plt.legend(loc='best') + plt.legend(loc='best') Getting Data In/Out ------------------- @@ -753,13 +784,13 @@ Writing to a HDF5 Store. .. ipython:: python - df.to_hdf('foo.h5','df') + df.to_hdf('foo.h5', 'df') Reading from a HDF5 Store. .. ipython:: python - pd.read_hdf('foo.h5','df') + pd.read_hdf('foo.h5', 'df') .. ipython:: python :suppress: @@ -796,7 +827,7 @@ If you are attempting to perform an operation you might see an exception like: .. code-block:: python >>> if pd.Series([False, True, False]): - print("I was true") + ... print("I was true") Traceback ... ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 835c4cc9d4ab3..39082ef7a4c69 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -9,7 +9,7 @@ import pandas as pd np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - pd.options.display.max_rows=15 + pd.options.display.max_rows = 15 ****************************** MultiIndex / Advanced Indexing @@ -49,6 +49,11 @@ analysis. See the :ref:`cookbook` for some advanced strategies. +.. versionchanged:: 0.24.0 + + :attr:`MultiIndex.labels` has been renamed to :attr:`MultiIndex.codes` + and :attr:`MultiIndex.set_labels` to :attr:`MultiIndex.set_codes`. + Creating a MultiIndex (hierarchical index) object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -188,10 +193,10 @@ highly performant. If you want to see only the used levels, you can use the .. ipython:: python - df[['foo','qux']].columns.values + df[['foo', 'qux']].columns.to_numpy() # for a specific level - df[['foo','qux']].columns.get_level_values(0) + df[['foo', 'qux']].columns.get_level_values(0) To reconstruct the ``MultiIndex`` with only the used levels, the :meth:`~MultiIndex.remove_unused_levels` method may be used. @@ -200,7 +205,7 @@ To reconstruct the ``MultiIndex`` with only the used levels, the .. ipython:: python - df[['foo','qux']].columns.remove_unused_levels() + df[['foo', 'qux']].columns.remove_unused_levels() Data alignment and using ``reindex`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -235,7 +240,7 @@ keys take the form of tuples. For example, the following works as you would expe df = df.T df - df.loc[('bar', 'two'),] + df.loc[('bar', 'two')] Note that ``df.loc['bar', 'two']`` would also work in this example, but this shorthand notation can lead to ambiguity in general. 
@@ -318,27 +323,28 @@ As usual, **both sides** of the slicers are included as this is label indexing. .. code-block:: python - df.loc[(slice('A1','A3'),.....), :] + df.loc[(slice('A1', 'A3'), ...), :] # noqa: E999   You should **not** do this:   .. code-block:: python - df.loc[(slice('A1','A3'),.....)] + df.loc[(slice('A1', 'A3'), ...)] # noqa: E999 .. ipython:: python - def mklbl(prefix,n): - return ["%s%s" % (prefix,i) for i in range(n)] + def mklbl(prefix, n): + return ["%s%s" % (prefix, i) for i in range(n)] - miindex = pd.MultiIndex.from_product([mklbl('A',4), - mklbl('B',2), - mklbl('C',4), - mklbl('D',2)]) - micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'), - ('b','foo'),('b','bah')], + miindex = pd.MultiIndex.from_product([mklbl('A', 4), + mklbl('B', 2), + mklbl('C', 4), + mklbl('D', 2)]) + micolumns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], names=['lvl0', 'lvl1']) - dfmi = pd.DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), + dfmi = pd.DataFrame(np.arange(len(miindex) * len(micolumns)) + .reshape((len(miindex), len(micolumns))), index=miindex, columns=micolumns).sort_index().sort_index(axis=1) dfmi @@ -347,7 +353,7 @@ Basic MultiIndex slicing using slices, lists, and labels. .. ipython:: python - dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :] + dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] You can use :class:`pandas.IndexSlice` to facilitate a more natural syntax @@ -412,7 +418,7 @@ selecting data at a particular level of a ``MultiIndex`` easier. .. ipython:: python # using the slicers - df.loc[(slice(None),'one'),:] + df.loc[(slice(None), 'one'), :] You can also select on the columns with ``xs``, by providing the axis argument. @@ -425,7 +431,7 @@ providing the axis argument. .. ipython:: python # using the slicers - df.loc[:,(slice(None),'one')] + df.loc[:, (slice(None), 'one')] ``xs`` also allows selection with multiple keys. @@ -436,7 +442,7 @@ providing the axis argument. .. ipython:: python # using the slicers - df.loc[:,('bar','one')] + df.loc[:, ('bar', 'one')] You can pass ``drop_level=False`` to ``xs`` to retain the level that was selected. @@ -467,9 +473,9 @@ values across a level. For instance: .. ipython:: python - midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']], - labels=[[1,1,0,0],[1,0,1,0]]) - df = pd.DataFrame(np.random.randn(4,2), index=midx) + midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']], + codes=[[1, 1, 0, 0], [1, 0, 1, 0]]) + df = pd.DataFrame(np.random.randn(4, 2), index=midx) df df2 = df.mean(level=0) df2 @@ -501,7 +507,48 @@ method, allowing you to permute the hierarchical index levels in one step: .. ipython:: python - df[:5].reorder_levels([1,0], axis=0) + df[:5].reorder_levels([1, 0], axis=0) + +.. _advanced.index_names: + +Renaming names of an ``Index`` or ``MultiIndex`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :meth:`~DataFrame.rename` method is used to rename the labels of a +``MultiIndex``, and is typically used to rename the columns of a ``DataFrame``. +The ``columns`` argument of ``rename`` allows a dictionary to be specified +that includes only the columns you wish to rename. + +.. ipython:: python + + df.rename(columns={0: "col0", 1: "col1"}) + +This method can also be used to rename specific labels of the main index +of the ``DataFrame``. + +.. 
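A minimal, self-contained sketch of the tuple-keyed selection described in the paragraph above (toy data, not taken from the pandas docs or test suite; behaviour as of pandas 0.23+):

.. code-block:: python

   import numpy as np
   import pandas as pd

   # Toy frame with a two-level row MultiIndex.
   index = pd.MultiIndex.from_product([['bar', 'baz'], ['one', 'two']],
                                      names=['first', 'second'])
   df = pd.DataFrame(np.arange(8).reshape(4, 2), index=index, columns=['A', 'B'])

   # A full tuple is an unambiguous key for a single row.
   df.loc[('bar', 'two')]

   # A partial key (first level only) returns the whole 'bar' block.
   df.loc['bar']
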
ipython:: python + + df.rename(index={"one": "two", "y": "z"}) + +The :meth:`~DataFrame.rename_axis` method is used to rename the name of a +``Index`` or ``MultiIndex``. In particular, the names of the levels of a +``MultiIndex`` can be specified, which is useful if ``reset_index()`` is later +used to move the values from the ``MultiIndex`` to a column. + +.. ipython:: python + + df.rename_axis(index=['abc', 'def']) + +Note that the columns of a ``DataFrame`` are an index, so that using +``rename_axis`` with the ``columns`` argument will change the name of that +index. + +.. ipython:: python + + df.rename_axis(columns="Cols").columns + +Both ``rename`` and ``rename_axis`` support specifying a dictionary, +``Series`` or a mapping function to map labels/names to new values. Sorting a ``MultiIndex`` ------------------------ @@ -564,7 +611,7 @@ Furthermore, if you try to index something that is not fully lexsorted, this can .. code-block:: ipython - In [5]: dfm.loc[(0,'y'):(1, 'z')] + In [5]: dfm.loc[(0, 'y'):(1, 'z')] UnsortedIndexError: 'Key length (2) was greater than MultiIndex lexsort depth (1)' The :meth:`~MultiIndex.is_lexsorted` method on a ``MultiIndex`` shows if the @@ -586,7 +633,7 @@ And now selection works as expected. .. ipython:: python - dfm.loc[(0,'y'):(1, 'z')] + dfm.loc[(0, 'y'):(1, 'z')] Take Methods ------------ @@ -647,12 +694,12 @@ faster than fancy indexing. indexer = np.arange(10000) random.shuffle(indexer) - timeit arr[indexer] - timeit arr.take(indexer, axis=0) + %timeit arr[indexer] + %timeit arr.take(indexer, axis=0) ser = pd.Series(arr[:, 0]) - timeit ser.iloc[indexer] - timeit ser.take(indexer) + %timeit ser.iloc[indexer] + %timeit ser.take(indexer) .. _indexing.index_types: @@ -661,7 +708,7 @@ Index Types We have discussed ``MultiIndex`` in the previous sections pretty extensively. Documentation about ``DatetimeIndex`` and ``PeriodIndex`` are shown :ref:`here `, -and documentation about ``TimedeltaIndex`` is found :ref:`here `. +and documentation about ``TimedeltaIndex`` is found :ref:`here `. In the following sub-sections we will highlight some other index types. @@ -677,7 +724,6 @@ and allows efficient indexing and storage of an index with a large number of dup .. ipython:: python from pandas.api.types import CategoricalDtype - df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) @@ -738,18 +784,17 @@ values **not** in the categories, similarly to how you can reindex **any** panda Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories or a ``TypeError`` will be raised. - .. code-block:: python + .. code-block:: ipython - In [9]: df3 = pd.DataFrame({'A' : np.arange(6), - 'B' : pd.Series(list('aabbca')).astype('category')}) + In [9]: df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')}) - In [11]: df3 = df3.set_index('B') + In [11]: df3 = df3.set_index('B') - In [11]: df3.index - Out[11]: CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'], categories=[u'a', u'b', u'c'], ordered=False, name=u'B', dtype='category') + In [11]: df3.index + Out[11]: CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'], categories=[u'a', u'b', u'c'], ordered=False, name=u'B', dtype='category') - In [12]: pd.concat([df2, df3] - TypeError: categories must match existing categories when appending + In [12]: pd.concat([df2, df3]) + TypeError: categories must match existing categories when appending .. 
_indexing.rangeindex: @@ -842,11 +887,11 @@ example, be millisecond offsets. .. ipython:: python - dfir = pd.concat([pd.DataFrame(np.random.randn(5,2), + dfir = pd.concat([pd.DataFrame(np.random.randn(5, 2), index=np.arange(5) * 250.0, columns=list('AB')), - pd.DataFrame(np.random.randn(6,2), - index=np.arange(4,10) * 250.1, + pd.DataFrame(np.random.randn(6, 2), + index=np.arange(4, 10) * 250.1, columns=list('AB'))]) dfir @@ -855,7 +900,7 @@ Selection operations then will always work on a value basis, for all selection o .. ipython:: python dfir[0:1000.4] - dfir.loc[0:1001,'A'] + dfir.loc[0:1001, 'A'] dfir.loc[1000.4] You could retrieve the first 1 second (1000 ms) of data as such: @@ -893,7 +938,7 @@ An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index .. ipython:: python df = pd.DataFrame({'A': [1, 2, 3, 4]}, - index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])) + index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])) df Label based indexing via ``.loc`` along the edges of an interval works as you would expect, @@ -973,7 +1018,8 @@ in the resulting ``IntervalIndex``: pd.interval_range(start=0, end=6, periods=4) - pd.interval_range(pd.Timestamp('2018-01-01'), pd.Timestamp('2018-02-28'), periods=3) + pd.interval_range(pd.Timestamp('2018-01-01'), + pd.Timestamp('2018-02-28'), periods=3) Miscellaneous indexing FAQ -------------------------- @@ -1010,7 +1056,7 @@ normal Python ``list``. Monotonicity of an index can be tested with the :meth:`~ .. ipython:: python - df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=list(range(5))) + df = pd.DataFrame(index=[2, 3, 3, 4, 5], columns=['data'], data=list(range(5))) df.index.is_monotonic_increasing # no rows 0 or 1, but still returns rows 2, 3 (both of them), and 4: @@ -1024,13 +1070,14 @@ On the other hand, if the index is not monotonic, then both slice bounds must be .. ipython:: python - df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=list(range(6))) + df = pd.DataFrame(index=[2, 3, 1, 4, 3, 5], + columns=['data'], data=list(range(6))) df.index.is_monotonic_increasing # OK because 2 and 4 are in the index df.loc[2:4, :] -.. code-block:: python +.. code-block:: ipython # 0 is not in the index In [9]: df.loc[0:4, :] diff --git a/doc/source/api.rst b/doc/source/api.rst index 073ed8a082a11..1a23587d2ebb5 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -245,6 +245,15 @@ Top-level evaluation eval +Hashing +~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + util.hash_array + util.hash_pandas_object + Testing ~~~~~~~ @@ -506,7 +515,6 @@ Reshaping, sorting Series.repeat Series.squeeze Series.view - Series.sortlevel Combining / joining / merging @@ -842,6 +850,22 @@ Sparse SparseSeries.to_coo SparseSeries.from_coo +.. autosummary:: + :toctree: generated/ + :template: autosummary/accessor_attribute.rst + + Series.sparse.npoints + Series.sparse.density + Series.sparse.fill_value + Series.sparse.sp_values + + +.. autosummary:: + :toctree: generated/ + + Series.sparse.from_coo + Series.sparse.to_coo + .. _api.dataframe: DataFrame @@ -906,7 +930,6 @@ Indexing, iteration DataFrame.loc DataFrame.iloc DataFrame.insert - DataFrame.insert DataFrame.__iter__ DataFrame.items DataFrame.keys @@ -1648,9 +1671,11 @@ IntervalIndex Components IntervalIndex.length IntervalIndex.values IntervalIndex.is_non_overlapping_monotonic + IntervalIndex.is_overlapping IntervalIndex.get_loc IntervalIndex.get_indexer IntervalIndex.set_closed + IntervalIndex.overlaps .. 
_api.multiindex: @@ -1687,7 +1712,7 @@ MultiIndex Attributes MultiIndex.names MultiIndex.levels - MultiIndex.labels + MultiIndex.codes MultiIndex.nlevels MultiIndex.levshape @@ -1698,8 +1723,9 @@ MultiIndex Components :toctree: generated/ MultiIndex.set_levels - MultiIndex.set_labels + MultiIndex.set_codes MultiIndex.to_hierarchical + MultiIndex.to_flat_index MultiIndex.to_frame MultiIndex.is_lexsorted MultiIndex.sortlevel @@ -2037,6 +2063,7 @@ Properties Interval.mid Interval.open_left Interval.open_right + Interval.overlaps Interval.right Timedelta @@ -2079,6 +2106,62 @@ Methods Timedelta.to_timedelta64 Timedelta.total_seconds +.. _api.dateoffsets: + +Date Offsets +------------ + +.. currentmodule:: pandas.tseries.offsets + +.. autosummary:: + :toctree: generated/ + + DateOffset + BusinessDay + BusinessHour + CustomBusinessDay + CustomBusinessHour + MonthOffset + MonthEnd + MonthBegin + BusinessMonthEnd + BusinessMonthBegin + CustomBusinessMonthEnd + CustomBusinessMonthBegin + SemiMonthOffset + SemiMonthEnd + SemiMonthBegin + Week + WeekOfMonth + LastWeekOfMonth + QuarterOffset + BQuarterEnd + BQuarterBegin + QuarterEnd + QuarterBegin + YearOffset + BYearEnd + BYearBegin + YearEnd + YearBegin + FY5253 + FY5253Quarter + Easter + Tick + Day + Hour + Minute + Second + Milli + Micro + Nano + BDay + BMonthEnd + BMonthBegin + CBMonthEnd + CBMonthBegin + CDay + .. _api.frequencies: Frequencies @@ -2400,6 +2483,7 @@ Style Application Styler.set_properties Styler.set_uuid Styler.clear + Styler.pipe Builtin Styles ~~~~~~~~~~~~~~ diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 6eeb97349100a..25e2c8cd1ff9a 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -5,6 +5,7 @@ import numpy as np import pandas as pd + np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15 @@ -45,8 +46,8 @@ of elements to display is five, but you may pass a custom number. .. _basics.attrs: -Attributes and the raw ndarray(s) ---------------------------------- +Attributes and Underlying Data +------------------------------ pandas objects have a number of attributes enabling you to access the metadata @@ -64,14 +65,43 @@ Note, **these attributes can be safely assigned to**! df.columns = [x.lower() for x in df.columns] df -To get the actual data inside a data structure, one need only access the -**values** property: +Pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be +thought of as containers for arrays, which hold the actual data and do the +actual computation. For many types, the underlying array is a +:class:`numpy.ndarray`. However, pandas and 3rd party libraries may *extend* +NumPy's type system to add support for custom arrays +(see :ref:`basics.dtypes`). + +To get the actual data inside a :class:`Index` or :class:`Series`, use +the **array** property .. ipython:: python - s.values - df.values - wp.values + s.array + s.index.array + +Depending on the data type (see :ref:`basics.dtypes`), :attr:`~Series.array` +be either a NumPy array or an :ref:`ExtensionArray `. +If you know you need a NumPy array, use :meth:`~Series.to_numpy` +or :meth:`numpy.asarray`. + +.. ipython:: python + + s.to_numpy() + np.asarray(s) + +For Series and Indexes backed by NumPy arrays (like we have here), this will +be the same as :attr:`~Series.array`. When the Series or Index is backed by +a :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy` +may involve copying data and coercing values. 
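+
+For example, with a hypothetical ``Series`` of ``category`` dtype (a small,
+purely illustrative example of an extension-array-backed ``Series``),
+``.array`` returns the underlying :class:`Categorical`, while ``.to_numpy()``
+coerces the values to a NumPy ``object``-dtype array:
+
+.. ipython:: python
+
+    ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
+    ser.array
+    ser.to_numpy()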
+
+Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more
+complex. When your ``DataFrame`` only has a single data type for all the
+columns, :meth:`DataFrame.to_numpy` will return the underlying data:
+
+.. ipython:: python
+
+    df.to_numpy()

 If a DataFrame or Panel contains homogeneously-typed data, the ndarray can
 actually be modified in-place, and the changes will be reflected in the data
@@ -86,6 +116,21 @@ unlike the axis labels, cannot be assigned to.
 strings are involved, the result will be of object dtype. If there are only
 floats and integers, the resulting array will be of float dtype.

+In the past, pandas recommended :attr:`Series.values` or :attr:`DataFrame.values`
+for extracting the data from a Series or DataFrame. You'll still find references
+to these in old code bases and online. Going forward, we recommend avoiding
+``.values`` and using ``.array`` or ``.to_numpy()``. ``.values`` has the following
+drawbacks:
+
+1. When your Series contains an :ref:`extension type `, it's
+   unclear whether :attr:`Series.values` returns a NumPy array or the extension array.
+   :attr:`Series.array` will always return the actual array backing the Series,
+   while :meth:`Series.to_numpy` will always return a NumPy array.
+2. When your DataFrame contains a mixture of data types, :attr:`DataFrame.values` may
+   involve copying data and coercing values to a common dtype, a relatively expensive
+   operation. :meth:`DataFrame.to_numpy`, being a method, makes it clearer that the
+   returned NumPy array may not be a view on the same data in the DataFrame.
+
 .. _basics.accelerate:

 Accelerated operations
@@ -149,9 +194,10 @@ either match on the *index* or *columns* via the **axis** keyword:

 .. ipython:: python

-    df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
-                       'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
-                       'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
+    df = pd.DataFrame({
+        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
+        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
+        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
     df
     row = df.iloc[1]
     column = df['two']
@@ -172,8 +218,9 @@ Furthermore you can align a level of a MultiIndexed DataFrame with a Series.

 .. ipython:: python

     dfmi = df.copy()
-    dfmi.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')],
-                                           names=['first','second'])
+    dfmi.index = pd.MultiIndex.from_tuples([(1, 'a'), (1, 'b'),
+                                            (1, 'c'), (2, 'a')],
+                                           names=['first', 'second'])
     dfmi.sub(column, axis=0, level='second')

 With Panel, describing the matching behavior is a bit more difficult, so
@@ -306,18 +353,18 @@ To evaluate single-element pandas objects in a boolean context, use the method

 .. code-block:: python

-    >>> if df:
-         ...
+    >>> if df:
+    ...     pass

 Or

 .. code-block:: python

-    >>> df and df2
+    >>> df and df2

 These will both raise errors, as you are trying to compare multiple values.

-.. code-block:: python
+.. code-block:: python-traceback

     ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().

@@ -329,17 +376,17 @@ Comparing if objects are equivalent
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Often you may find that there is more than one way to compute the same
-result. As a simple example, consider ``df+df`` and ``df*2``. To test
+result. As a simple example, consider ``df + df`` and ``df * 2``. To test
 that these two computations produce the same result, given the tools
-shown above, you might imagine using ``(df+df == df*2).all()``. But in
+shown above, you might imagine using ``(df + df == df * 2).all()``. But in
 fact, this expression is False:

 .. ipython:: python

-    df+df == df*2
-    (df+df == df*2).all()
+    df + df == df * 2
+    (df + df == df * 2).all()

-Notice that the boolean DataFrame ``df+df == df*2`` contains some False values!
+Notice that the boolean DataFrame ``df + df == df * 2`` contains some False values!
 This is because NaNs do not compare as equals:

 .. ipython:: python
@@ -352,15 +399,15 @@ corresponding locations treated as equal.

 .. ipython:: python

-    (df+df).equals(df*2)
+    (df + df).equals(df * 2)

 Note that the Series or DataFrame index needs to be in the same order for
 equality to be True:

 .. ipython:: python

-    df1 = pd.DataFrame({'col':['foo', 0, np.nan]})
-    df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0])
+    df1 = pd.DataFrame({'col': ['foo', 0, np.nan]})
+    df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0])
     df1.equals(df2)
     df1.equals(df2.sort_index())

@@ -423,10 +470,10 @@ which we illustrate:

 .. ipython:: python

-    df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
-                        'B' : [np.nan, 2., 3., np.nan, 6.]})
-    df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
-                        'B' : [np.nan, np.nan, 3., 4., 6., 8.]})
+    df1 = pd.DataFrame({'A': [1., np.nan, 3., 5., np.nan],
+                        'B': [np.nan, 2., 3., np.nan, 6.]})
+    df2 = pd.DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
+                        'B': [np.nan, np.nan, 3., 4., 6., 8.]})
     df1
     df2
     df1.combine_first(df2)
@@ -443,7 +490,8 @@ So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above:

 .. ipython:: python

-    combiner = lambda x, y: np.where(pd.isna(x), y, x)
+    def combiner(x, y):
+        return np.where(pd.isna(x), y, x)
     df1.combine(df2, combiner)

 .. _basics.stats:
@@ -537,7 +585,7 @@ will exclude NAs on Series input by default:

 .. ipython:: python

     np.mean(df['one'])
-    np.mean(df['one'].values)
+    np.mean(df['one'].to_numpy())

 :meth:`Series.nunique` will return the number of unique non-NA values in a
 Series:
@@ -546,7 +594,7 @@

     series = pd.Series(np.random.randn(500))
     series[20:500] = np.nan
-    series[10:20]  = 5
+    series[10:20] = 5
     series.nunique()

 .. _basics.describe:
@@ -563,7 +611,8 @@ course):

     series = pd.Series(np.random.randn(1000))
     series[::2] = np.nan
     series.describe()
-    frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
+    frame = pd.DataFrame(np.random.randn(1000, 5),
+                         columns=['a', 'b', 'c', 'd', 'e'])
     frame.iloc[::2] = np.nan
     frame.describe()

@@ -619,7 +668,7 @@ corresponding values:

     s1
     s1.idxmin(), s1.idxmax()

-    df1 = pd.DataFrame(np.random.randn(5,3), columns=['A','B','C'])
+    df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
     df1
     df1.idxmin(axis=0)
     df1.idxmax(axis=1)
@@ -732,9 +781,8 @@ with the equivalent

 .. code-block:: python

     >>> (df.pipe(h)
-           .pipe(g, arg1=1)
-           .pipe(f, arg2=2, arg3=3)
-        )
+    ...    .pipe(g, arg1=1)
+    ...    .pipe(f, arg2=2, arg3=3))

 Pandas encourages the second style, which is known as method chaining.
 ``pipe`` makes it easy to use your own or another library's functions
@@ -754,11 +802,11 @@ For example, we can fit a regression using statsmodels.
Their API expects a form bb = pd.read_csv('data/baseball.csv', index_col='id') (bb.query('h > 0') - .assign(ln_h = lambda df: np.log(df.h)) + .assign(ln_h=lambda df: np.log(df.h)) .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)') .fit() .summary() - ) + ) The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which have introduced the popular ``(%>%)`` (read pipe) operator for R_. @@ -767,7 +815,7 @@ We encourage you to view the source code of :meth:`~DataFrame.pipe`. .. _dplyr: https://github.com/hadley/dplyr .. _magrittr: https://github.com/smbache/magrittr -.. _R: http://www.r-project.org +.. _R: https://www.r-project.org Row or Column-wise Function Application @@ -835,7 +883,7 @@ Series operation on each column or row: tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], index=pd.date_range('1/1/2000', periods=10)) - tsdf.values[3:7] = np.nan + tsdf.iloc[3:7] = np.nan .. ipython:: python @@ -881,7 +929,8 @@ output: tsdf.agg('sum') - # these are equivalent to a ``.sum()`` because we are aggregating on a single function + # these are equivalent to a ``.sum()`` because we are aggregating + # on a single function tsdf.sum() Single aggregations on a ``Series`` this will return a scalar value: @@ -925,7 +974,7 @@ Passing a named function will yield that name for the row: .. ipython:: python def mymean(x): - return x.mean() + return x.mean() tsdf.A.agg(['sum', mymean]) @@ -1039,14 +1088,14 @@ will be the names of the transforming functions. .. ipython:: python - tsdf.transform([np.abs, lambda x: x+1]) + tsdf.transform([np.abs, lambda x: x + 1]) Passing multiple functions to a Series will yield a DataFrame. The resulting column names will be the transforming functions. .. ipython:: python - tsdf.A.transform([np.abs, lambda x: x+1]) + tsdf.A.transform([np.abs, lambda x: x + 1]) Transforming with a dict @@ -1057,7 +1106,7 @@ Passing a dict of functions will allow selective transforming per column. .. ipython:: python - tsdf.transform({'A': np.abs, 'B': lambda x: x+1}) + tsdf.transform({'A': np.abs, 'B': lambda x: x + 1}) Passing a dict of lists will generate a MultiIndexed DataFrame with these selective transforms. @@ -1065,7 +1114,7 @@ selective transforms. .. ipython:: python :okwarning: - tsdf.transform({'A': np.abs, 'B': [lambda x: x+1, 'sqrt']}) + tsdf.transform({'A': np.abs, 'B': [lambda x: x + 1, 'sqrt']}) .. _basics.elementwise: @@ -1085,7 +1134,10 @@ a single value and returning a single value. For example: .. ipython:: python df4 - f = lambda x: len(str(x)) + + def f(x): + return len(str(x)) + df4['one'].map(f) df4.applymap(f) @@ -1097,85 +1149,11 @@ to :ref:`merging/joining functionality `: s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], index=['a', 'b', 'c', 'd', 'e']) - t = pd.Series({'six' : 6., 'seven' : 7.}) + t = pd.Series({'six': 6., 'seven': 7.}) s s.map(t) -.. _basics.apply_panel: - -Applying with a Panel -~~~~~~~~~~~~~~~~~~~~~ - -Applying with a ``Panel`` will pass a ``Series`` to the applied function. If the applied -function returns a ``Series``, the result of the application will be a ``Panel``. If the applied function -reduces to a scalar, the result of the application will be a ``DataFrame``. - -.. ipython:: python - - import pandas.util.testing as tm - panel = tm.makePanel(5) - panel - panel['ItemA'] - -A transformational apply. - -.. ipython:: python - - result = panel.apply(lambda x: x*2, axis='items') - result - result['ItemA'] - -A reduction operation. - -.. 
ipython:: python - - panel.apply(lambda x: x.dtype, axis='items') - -A similar reduction type operation. - -.. ipython:: python - - panel.apply(lambda x: x.sum(), axis='major_axis') - -This last reduction is equivalent to: - -.. ipython:: python - - panel.sum('major_axis') - -A transformation operation that returns a ``Panel``, but is computing -the z-score across the ``major_axis``. - -.. ipython:: python - - result = panel.apply( - lambda x: (x-x.mean())/x.std(), - axis='major_axis') - result - result['ItemA'] - -Apply can also accept multiple axes in the ``axis`` argument. This will pass a -``DataFrame`` of the cross-section to the applied function. - -.. ipython:: python - - f = lambda x: ((x.T-x.mean(1))/x.std(1)).T - - result = panel.apply(f, axis = ['items','major_axis']) - result - result.loc[:,:,'ItemA'] - -This is equivalent to the following: - -.. ipython:: python - - result = pd.Panel(dict([ (ax, f(panel.loc[:,:,ax])) - for ax in panel.minor_axis ])) - result - result.loc[:,:,'ItemA'] - - .. _basics.reindexing: Reindexing and altering labels @@ -1466,8 +1444,21 @@ for altering the ``Series.name`` attribute. .. _basics.rename_axis: -The Panel class has a related :meth:`~Panel.rename_axis` class which can rename -any of its three axes. +.. versionadded:: 0.24.0 + +The methods :meth:`~DataFrame.rename_axis` and :meth:`~Series.rename_axis` +allow specific names of a `MultiIndex` to be changed (as opposed to the +labels). + +.. ipython:: python + + df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], + 'y': [10, 20, 30, 40, 50, 60]}, + index=pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]], + names=['let', 'num'])) + df + df.rename_axis(index={'let': 'abc'}) + df.rename_axis(index=str.upper) .. _basics.iteration: @@ -1488,14 +1479,14 @@ In short, basic iteration (``for i in object``) produces: Thus, for example, iterating over a DataFrame gives you the column names: -.. ipython:: +.. ipython:: python + + df = pd.DataFrame({'col1': np.random.randn(3), + 'col2': np.random.randn(3)}, index=['a', 'b', 'c']) - In [0]: df = pd.DataFrame({'col1' : np.random.randn(3), 'col2' : np.random.randn(3)}, - ...: index=['a', 'b', 'c']) + for col in df: + print(col) - In [0]: for col in df: - ...: print(col) - ...: Pandas objects also have the dict-like :meth:`~DataFrame.iteritems` method to iterate over the (key, value) pairs. @@ -1558,12 +1549,11 @@ through key-value pairs: For example: -.. ipython:: +.. ipython:: python - In [0]: for item, frame in wp.iteritems(): - ...: print(item) - ...: print(frame) - ...: + for item, frame in wp.iteritems(): + print(item) + print(frame) .. _basics.iterrows: @@ -1574,11 +1564,10 @@ iterrows DataFrame as Series objects. It returns an iterator yielding each index value along with a Series containing the data in each row: -.. ipython:: +.. ipython:: python - In [0]: for row_index, row in df.iterrows(): - ...: print('%s\n%s' % (row_index, row)) - ...: + for row_index, row in df.iterrows(): + print(row_index, row, sep='\n') .. note:: @@ -1613,7 +1602,7 @@ For instance, a contrived way to transpose the DataFrame would be: print(df2) print(df2.T) - df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows())) + df2_t = pd.DataFrame({idx: values for idx, values in df2.iterrows()}) print(df2_t) itertuples @@ -1664,7 +1653,7 @@ This enables nice expressions like this: .. ipython:: python - s[s.dt.day==2] + s[s.dt.day == 2] You can easily produces tz aware transformations: @@ -1760,9 +1749,10 @@ used to sort a pandas object by its index levels. .. 
ipython:: python - df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']), - 'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), - 'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame({ + 'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']), + 'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), + 'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'], columns=['three', 'two', 'one']) @@ -1788,14 +1778,16 @@ to use to determine the sorted order. .. ipython:: python - df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) + df1 = pd.DataFrame({'one': [2, 1, 1, 1], + 'two': [1, 3, 2, 4], + 'three': [5, 4, 3, 2]}) df1.sort_values(by='two') The ``by`` parameter can take a list of column names, e.g.: .. ipython:: python - df1[['one', 'two', 'three']].sort_values(by=['one','two']) + df1[['one', 'two', 'three']].sort_values(by=['one', 'two']) These methods have special treatment of NA values via the ``na_position`` argument: @@ -1898,8 +1890,10 @@ all levels to ``by``. .. ipython:: python - df1.columns = pd.MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) - df1.sort_values(by=('a','two')) + df1.columns = pd.MultiIndex.from_tuples([('a', 'one'), + ('a', 'two'), + ('b', 'three')]) + df1.sort_values(by=('a', 'two')) Copying @@ -1925,17 +1919,29 @@ dtypes ------ For the most part, pandas uses NumPy arrays and dtypes for Series or individual -columns of a DataFrame. The main types allowed in pandas objects are ``float``, -``int``, ``bool``, and ``datetime64[ns]`` (note that NumPy does not support -timezone-aware datetimes). - -In addition to NumPy's types, pandas :ref:`extends ` -NumPy's type-system for a few cases. - -* :ref:`Categorical ` -* :ref:`Datetime with Timezone ` -* :ref:`Period ` -* :ref:`Interval ` +columns of a DataFrame. NumPy provides support for ``float``, +``int``, ``bool``, ``timedelta64[ns]`` and ``datetime64[ns]`` (note that NumPy +does not support timezone-aware datetimes). + +Pandas and third-party libraries *extend* NumPy's type system in a few places. +This section describes the extensions pandas has made internally. +See :ref:`extending.extension-types` for how to write your own extension that +works with pandas. See :ref:`ecosystem.extensions` for a list of third-party +libraries that have implemented an extension. + +The following table lists all of pandas extension types. See the respective +documentation sections for more on each type. + +=================== ========================= ================== ============================= ============================= +Kind of Data Data Type Scalar Array Documentation +=================== ========================= ================== ============================= ============================= +tz-aware datetime :class:`DatetimeArray` :class:`Timestamp` :class:`arrays.DatetimeArray` :ref:`timeseries.timezone` +Categorical :class:`CategoricalDtype` (none) :class:`Categorical` :ref:`categorical` +period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays.PeriodArray` :ref:`timeseries.periods` +sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` +intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` +nullable integer :clsas:`Int64Dtype`, ... 
(none) :class:`arrays.IntegerArray` :ref:`integer_na` +=================== ========================= ================== ============================= ============================= Pandas uses the ``object`` dtype for storing strings. @@ -1948,13 +1954,13 @@ with the data type of each column. .. ipython:: python - dft = pd.DataFrame(dict(A = np.random.rand(3), - B = 1, - C = 'foo', - D = pd.Timestamp('20010102'), - E = pd.Series([1.0]*3).astype('float32'), - F = False, - G = pd.Series([1]*3,dtype='int8'))) + dft = pd.DataFrame({'A': np.random.rand(3), + 'B': 1, + 'C': 'foo', + 'D': pd.Timestamp('20010102'), + 'E': pd.Series([1.0] * 3).astype('float32'), + 'F': False, + 'G': pd.Series([1] * 3, dtype='int8')}) dft dft.dtypes @@ -1993,9 +1999,10 @@ different numeric dtypes will **NOT** be combined. The following example will gi df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') df1 df1.dtypes - df2 = pd.DataFrame(dict( A = pd.Series(np.random.randn(8), dtype='float16'), - B = pd.Series(np.random.randn(8)), - C = pd.Series(np.array(np.random.randn(8), dtype='uint8')) )) + df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'), + 'B': pd.Series(np.random.randn(8)), + 'C': pd.Series(np.array(np.random.randn(8), + dtype='uint8'))}) df2 df2.dtypes @@ -2010,7 +2017,7 @@ The following will all result in ``int64`` dtypes. pd.DataFrame([1, 2], columns=['a']).dtypes pd.DataFrame({'a': [1, 2]}).dtypes - pd.DataFrame({'a': 1 }, index=list(range(2))).dtypes + pd.DataFrame({'a': 1}, index=list(range(2))).dtypes Note that Numpy will choose *platform-dependent* types when creating arrays. The following **WILL** result in ``int32`` on 32-bit platform. @@ -2032,13 +2039,13 @@ from the current type (e.g. ``int`` to ``float``). df3 df3.dtypes -The ``values`` attribute on a DataFrame return the *lower-common-denominator* of the dtypes, meaning +:meth:`DataFrame.to_numpy` will return the *lower-common-denominator* of the dtypes, meaning the dtype that can accommodate **ALL** of the types in the resulting homogeneous dtyped NumPy array. This can force some *upcasting*. .. ipython:: python - df3.values.dtype + df3.to_numpy().dtype astype ~~~~~~ @@ -2065,8 +2072,8 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`. .. ipython:: python - dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]}) - dft[['a','b']] = dft[['a','b']].astype(np.uint8) + dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) + dft[['a', 'b']] = dft[['a', 'b']].astype(np.uint8) dft dft.dtypes @@ -2076,7 +2083,7 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python - dft1 = pd.DataFrame({'a': [1,0,1], 'b': [4,5,6], 'c': [7, 8, 9]}) + dft1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]}) dft1 = dft1.astype({'a': np.bool, 'c': np.float64}) dft1 dft1.dtypes @@ -2089,7 +2096,7 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python - dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]}) + dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) dft.loc[:, ['a', 'b']].astype(np.uint8).dtypes dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) dft.dtypes @@ -2109,7 +2116,8 @@ to the correct type. 
import datetime df = pd.DataFrame([[1, 2], ['a', 'b'], - [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]]) + [datetime.datetime(2016, 3, 2), + datetime.datetime(2016, 3, 2)]]) df = df.T df df.dtypes @@ -2197,7 +2205,8 @@ as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the .. ipython:: python import datetime - df = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O') + df = pd.DataFrame([ + ['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O') df df.apply(pd.to_datetime) @@ -2223,7 +2232,7 @@ See also :ref:`Support for integer NA `. dfi dfi.dtypes - casted = dfi[dfi>0] + casted = dfi[dfi > 0] casted casted.dtypes @@ -2235,7 +2244,7 @@ While float dtypes are unchanged. dfa['A'] = dfa['A'].astype('float32') dfa.dtypes - casted = dfa[df2>0] + casted = dfa[df2 > 0] casted casted.dtypes @@ -2258,11 +2267,11 @@ dtypes: 'float64': np.arange(4.0, 7.0), 'bool1': [True, False, True], 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3).values, + 'dates': pd.date_range('now', periods=3), 'category': pd.Series(list("ABC")).astype('category')}) df['tdeltas'] = df.dates.diff() df['uint64'] = np.arange(3, 6).astype('u8') - df['other_dates'] = pd.date_range('20130101', periods=3).values + df['other_dates'] = pd.date_range('20130101', periods=3) df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern') df @@ -2283,7 +2292,7 @@ For example, to select ``bool`` columns: df.select_dtypes(include=[bool]) You can also pass the name of a dtype in the `NumPy dtype hierarchy -`__: +`__: .. ipython:: python @@ -2324,4 +2333,4 @@ All NumPy dtypes are subclasses of ``numpy.generic``: .. note:: Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal - NumPy hierarchy and won't show up with the above function. + NumPy hierarchy and won't show up with the above function. \ No newline at end of file diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index acab9de905540..31f2430e4be88 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -60,14 +60,14 @@ By specifying ``dtype="category"`` when constructing a ``Series``: .. ipython:: python - s = pd.Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a", "b", "c", "a"], dtype="category") s By converting an existing ``Series`` or column to a ``category`` dtype: .. ipython:: python - df = pd.DataFrame({"A":["a","b","c","a"]}) + df = pd.DataFrame({"A": ["a", "b", "c", "a"]}) df["B"] = df["A"].astype('category') df @@ -86,11 +86,11 @@ By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], + raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False) s = pd.Series(raw_cat) s - df = pd.DataFrame({"A":["a","b","c","a"]}) + df = pd.DataFrame({"A": ["a", "b", "c", "a"]}) df["B"] = raw_cat df @@ -155,7 +155,6 @@ of :class:`~pandas.api.types.CategoricalDtype`. .. ipython:: python from pandas.api.types import CategoricalDtype - s = pd.Series(["a", "b", "c", "a"]) cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True) @@ -167,6 +166,7 @@ are consistent among all columns. .. 
ipython:: python + from pandas.api.types import CategoricalDtype df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) cat_type = CategoricalDtype(categories=list('abcd'), ordered=True) @@ -178,7 +178,7 @@ are consistent among all columns. To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as categories for each column, the ``categories`` parameter can be determined programmatically by - ``categories = pd.unique(df.values.ravel())``. + ``categories = pd.unique(df.to_numpy().ravel())``. If you already have ``codes`` and ``categories``, you can use the :func:`~pandas.Categorical.from_codes` constructor to save the factorize step @@ -186,8 +186,9 @@ during normal constructor mode: .. ipython:: python - splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) + splitter = np.random.choice([0, 1], 5, p=[0.5, 0.5]) + s = pd.Series(pd.Categorical.from_codes(splitter, + categories=["train", "test"])) Regaining Original Data @@ -198,7 +199,7 @@ To get back to the original ``Series`` or NumPy array, use .. ipython:: python - s = pd.Series(["a","b","c","a"]) + s = pd.Series(["a", "b", "c", "a"]) s s2 = s.astype('category') s2 @@ -236,7 +237,6 @@ by default. .. ipython:: python from pandas.api.types import CategoricalDtype - CategoricalDtype(['a', 'b', 'c']) CategoricalDtype(['a', 'b', 'c'], ordered=True) CategoricalDtype() @@ -268,7 +268,7 @@ unordered categoricals, the order of the ``categories`` is not considered. c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False) # Unequal, since the second CategoricalDtype is ordered - c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) + c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. @@ -293,7 +293,7 @@ output to a ``Series`` or ``DataFrame`` of type ``string``. .. ipython:: python cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"]) - df = pd.DataFrame({"cat":cat, "s":["a", "c", "c", np.nan]}) + df = pd.DataFrame({"cat": cat, "s": ["a", "c", "c", np.nan]}) df.describe() df["cat"].describe() @@ -309,7 +309,7 @@ specify categories and ordering, they are inferred from the passed arguments. .. ipython:: python - s = pd.Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a", "b", "c", "a"], dtype="category") s.cat.categories s.cat.ordered @@ -317,7 +317,8 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","a"], categories=["c","b","a"])) + s = pd.Series(pd.Categorical(["a", "b", "c", "a"], + categories=["c", "b", "a"])) s.cat.categories s.cat.ordered @@ -354,11 +355,11 @@ Renaming categories is done by assigning new values to the .. ipython:: python - s = pd.Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a", "b", "c", "a"], dtype="category") s s.cat.categories = ["Group %s" % g for g in s.cat.categories] s - s = s.cat.rename_categories([1,2,3]) + s = s.cat.rename_categories([1, 2, 3]) s # You can also pass a dict-like object to map the renaming s = s.cat.rename_categories({1: 'x', 2: 'y', 3: 'z'}) @@ -378,18 +379,18 @@ Categories must be unique or a `ValueError` is raised: .. ipython:: python try: - s.cat.categories = [1,1,1] + s.cat.categories = [1, 1, 1] except ValueError as e: - print("ValueError: " + str(e)) + print("ValueError:", str(e)) Categories must also not be ``NaN`` or a `ValueError` is raised: .. 
ipython:: python try: - s.cat.categories = [1,2,np.nan] + s.cat.categories = [1, 2, np.nan] except ValueError as e: - print("ValueError: " + str(e)) + print("ValueError:", str(e)) Appending new categories ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -422,7 +423,8 @@ Removing unused categories can also be done: .. ipython:: python - s = pd.Series(pd.Categorical(["a","b","a"], categories=["a","b","c","d"])) + s = pd.Series(pd.Categorical(["a", "b", "a"], + categories=["a", "b", "c", "d"])) s s.cat.remove_unused_categories() @@ -436,9 +438,9 @@ use :meth:`~pandas.Categorical.set_categories`. .. ipython:: python - s = pd.Series(["one","two","four", "-"], dtype="category") + s = pd.Series(["one", "two", "four", "-"], dtype="category") s - s = s.cat.set_categories(["one","two","three","four"]) + s = s.cat.set_categories(["one", "two", "three", "four"]) s .. note:: @@ -456,9 +458,9 @@ meaning and certain operations are possible. If the categorical is unordered, `` .. ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) + s = pd.Series(pd.Categorical(["a", "b", "c", "a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a","b","c","a"]).astype( + s = pd.Series(["a", "b", "c", "a"]).astype( CategoricalDtype(ordered=True) ) s.sort_values(inplace=True) @@ -478,8 +480,8 @@ This is even true for strings and numeric data: .. ipython:: python - s = pd.Series([1,2,3,1], dtype="category") - s = s.cat.set_categories([2,3,1], ordered=True) + s = pd.Series([1, 2, 3, 1], dtype="category") + s = s.cat.set_categories([2, 3, 1], ordered=True) s s.sort_values(inplace=True) s @@ -496,8 +498,8 @@ necessarily make the sort order the same as the categories order. .. ipython:: python - s = pd.Series([1,2,3,1], dtype="category") - s = s.cat.reorder_categories([2,3,1], ordered=True) + s = pd.Series([1, 2, 3, 1], dtype="category") + s = s.cat.reorder_categories([2, 3, 1], ordered=True) s s.sort_values(inplace=True) s @@ -526,16 +528,18 @@ The ordering of the categorical is determined by the ``categories`` of that colu .. ipython:: python - dfs = pd.DataFrame({'A' : pd.Categorical(list('bbeebbaa'), categories=['e','a','b'], ordered=True), - 'B' : [1,2,1,2,2,1,2,1] }) + dfs = pd.DataFrame({'A': pd.Categorical(list('bbeebbaa'), + categories=['e', 'a', 'b'], + ordered=True), + 'B': [1, 2, 1, 2, 2, 1, 2, 1]}) dfs.sort_values(by=['A', 'B']) Reordering the ``categories`` changes a future sort. .. ipython:: python - dfs['A'] = dfs['A'].cat.reorder_categories(['a','b','e']) - dfs.sort_values(by=['A','B']) + dfs['A'] = dfs['A'].cat.reorder_categories(['a', 'b', 'e']) + dfs.sort_values(by=['A', 'B']) Comparisons ----------- @@ -560,13 +564,13 @@ categories or a categorical with any list-like object, will raise a ``TypeError` .. ipython:: python - cat = pd.Series([1,2,3]).astype( + cat = pd.Series([1, 2, 3]).astype( CategoricalDtype([3, 2, 1], ordered=True) ) - cat_base = pd.Series([2,2,2]).astype( + cat_base = pd.Series([2, 2, 2]).astype( CategoricalDtype([3, 2, 1], ordered=True) ) - cat_base2 = pd.Series([2,2,2]).astype( + cat_base2 = pd.Series([2, 2, 2]).astype( CategoricalDtype(ordered=True) ) @@ -586,7 +590,7 @@ Equality comparisons work with any list-like object of same length and scalars: .. 
ipython:: python cat == cat_base - cat == np.array([1,2,3]) + cat == np.array([1, 2, 3]) cat == 2 This doesn't work because the categories are not the same: @@ -596,7 +600,7 @@ This doesn't work because the categories are not the same: try: cat > cat_base2 except TypeError as e: - print("TypeError: " + str(e)) + print("TypeError:", str(e)) If you want to do a "non-equality" comparison of a categorical series with a list-like object which is not categorical data, you need to be explicit and convert the categorical data back to @@ -604,12 +608,12 @@ the original values: .. ipython:: python - base = np.array([1,2,3]) + base = np.array([1, 2, 3]) try: cat > base except TypeError as e: - print("TypeError: " + str(e)) + print("TypeError:", str(e)) np.asarray(cat) > base @@ -632,28 +636,34 @@ even if some categories are not present in the data: .. ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","c"], categories=["c","a","b","d"])) + s = pd.Series(pd.Categorical(["a", "b", "c", "c"], + categories=["c", "a", "b", "d"])) s.value_counts() Groupby will also show "unused" categories: .. ipython:: python - cats = pd.Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) - df = pd.DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) + cats = pd.Categorical(["a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"]) + df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]}) df.groupby("cats").mean() - cats2 = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) - df2 = pd.DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) - df2.groupby(["cats","B"]).mean() + cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) + df2 = pd.DataFrame({"cats": cats2, + "B": ["c", "d", "c", "d"], + "values": [1, 2, 3, 4]}) + df2.groupby(["cats", "B"]).mean() Pivot tables: .. ipython:: python - raw_cat = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) - df = pd.DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) + raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) + df = pd.DataFrame({"A": raw_cat, + "B": ["c", "d", "c", "d"], + "values": [1, 2, 3, 4]}) pd.pivot_table(df, values='values', index=['A', 'B']) Data munging @@ -671,13 +681,14 @@ If the slicing operation returns either a ``DataFrame`` or a column of type .. ipython:: python - idx = pd.Index(["h","i","j","k","l","m","n",]) - cats = pd.Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) - values= [1,2,2,2,3,4,5] - df = pd.DataFrame({"cats":cats,"values":values}, index=idx) - df.iloc[2:4,:] - df.iloc[2:4,:].dtypes - df.loc["h":"j","cats"] + idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) + cats = pd.Series(["a", "b", "b", "b", "c", "c", "c"], + dtype="category", index=idx) + values = [1, 2, 2, 2, 3, 4, 5] + df = pd.DataFrame({"cats": cats, "values": values}, index=idx) + df.iloc[2:4, :] + df.iloc[2:4, :].dtypes + df.loc["h":"j", "cats"] df[df["cats"] == "b"] An example where the category type is not preserved is if you take one single @@ -693,9 +704,9 @@ of length "1". .. ipython:: python - df.iat[0,0] - df["cats"].cat.categories = ["x","y","z"] - df.at["h","cats"] # returns a string + df.iat[0, 0] + df["cats"].cat.categories = ["x", "y", "z"] + df.at["h", "cats"] # returns a string .. note:: The is in contrast to R's `factor` function, where ``factor(c(1,2,3))[1]`` @@ -706,7 +717,7 @@ a single value: .. 
ipython:: python - df.loc[["h"],"cats"] + df.loc[["h"], "cats"] String and datetime accessors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -760,36 +771,38 @@ value is included in the `categories`: .. ipython:: python - idx = pd.Index(["h","i","j","k","l","m","n"]) - cats = pd.Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) - values = [1,1,1,1,1,1,1] - df = pd.DataFrame({"cats":cats,"values":values}, index=idx) + idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) + cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"], + categories=["a", "b"]) + values = [1, 1, 1, 1, 1, 1, 1] + df = pd.DataFrame({"cats": cats, "values": values}, index=idx) - df.iloc[2:4,:] = [["b",2],["b",2]] + df.iloc[2:4, :] = [["b", 2], ["b", 2]] df try: - df.iloc[2:4,:] = [["c",3],["c",3]] + df.iloc[2:4, :] = [["c", 3], ["c", 3]] except ValueError as e: - print("ValueError: " + str(e)) + print("ValueError:", str(e)) Setting values by assigning categorical data will also check that the `categories` match: .. ipython:: python - df.loc["j":"k","cats"] = pd.Categorical(["a","a"], categories=["a","b"]) + df.loc["j":"k", "cats"] = pd.Categorical(["a", "a"], categories=["a", "b"]) df try: - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b","c"]) + df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], + categories=["a", "b", "c"]) except ValueError as e: - print("ValueError: " + str(e)) + print("ValueError:", str(e)) Assigning a ``Categorical`` to parts of a column of other types will use the values: .. ipython:: python - df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) - df.loc[1:2,"a"] = pd.Categorical(["b","b"], categories=["a","b"]) - df.loc[2:3,"b"] = pd.Categorical(["b","b"], categories=["a","b"]) + df = pd.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + df.loc[1:2, "a"] = pd.Categorical(["b", "b"], categories=["a", "b"]) + df.loc[2:3, "b"] = pd.Categorical(["b", "b"], categories=["a", "b"]) df df.dtypes @@ -803,10 +816,10 @@ but the categories of these categoricals need to be the same: .. ipython:: python - cat = pd.Series(["a","b"], dtype="category") - vals = [1,2] - df = pd.DataFrame({"cats":cat, "vals":vals}) - res = pd.concat([df,df]) + cat = pd.Series(["a", "b"], dtype="category") + vals = [1, 2] + df = pd.DataFrame({"cats": cat, "vals": vals}) + res = pd.concat([df, df]) res res.dtypes @@ -815,11 +828,11 @@ In this case the categories are not the same, and therefore an error is raised: .. ipython:: python df_different = df.copy() - df_different["cats"].cat.categories = ["c","d"] + df_different["cats"].cat.categories = ["c", "d"] try: - pd.concat([df,df_different]) + pd.concat([df, df_different]) except ValueError as e: - print("ValueError: " + str(e)) + print("ValueError:", str(e)) The same applies to ``df.append(df_different)``. @@ -942,7 +955,7 @@ Use ``.astype`` or ``union_categoricals`` to get ``category`` result. pd.concat([s1, s3]) pd.concat([s1, s3]).astype('category') - union_categoricals([s1.values, s3.values]) + union_categoricals([s1.array, s3.array]) Following table summarizes the results of ``Categoricals`` related concatenations. @@ -976,16 +989,16 @@ relevant columns back to `category` and assign the right categories and categori .. ipython:: python :suppress: - from pandas.compat import StringIO .. 
ipython:: python + from pandas.compat import StringIO s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) # rename the categories s.cat.categories = ["very good", "good", "bad"] # reorder the categories and add missing categories s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) - df = pd.DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) + df = pd.DataFrame({"cats": s, "vals": [1, 2, 3, 4, 5, 6]}) csv = StringIO() df.to_csv(csv) df2 = pd.read_csv(StringIO(csv.getvalue())) @@ -993,7 +1006,8 @@ relevant columns back to `category` and assign the right categories and categori df2["cats"] # Redo the category df2["cats"] = df2["cats"].astype("category") - df2["cats"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"], + df2["cats"].cat.set_categories(["very bad", "bad", "medium", + "good", "very good"], inplace=True) df2.dtypes df2["cats"] @@ -1062,7 +1076,7 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = pd.Series(['foo','bar']*1000) + s = pd.Series(['foo', 'bar'] * 1000) # object dtype s.nbytes @@ -1099,13 +1113,13 @@ NumPy itself doesn't know about the new `dtype`: try: np.dtype("category") except TypeError as e: - print("TypeError: " + str(e)) + print("TypeError:", str(e)) dtype = pd.Categorical(["a"]).dtype try: np.dtype(dtype) except TypeError as e: - print("TypeError: " + str(e)) + print("TypeError:", str(e)) Dtype comparisons work: @@ -1126,12 +1140,12 @@ are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python - s = pd.Series(pd.Categorical([1,2,3,4])) + s = pd.Series(pd.Categorical([1, 2, 3, 4])) try: np.sum(s) - #same with np.log(s),.. + # same with np.log(s),... except TypeError as e: - print("TypeError: " + str(e)) + print("TypeError:", str(e)) .. note:: If such a function works, please file a bug at https://github.com/pandas-dev/pandas! @@ -1145,9 +1159,9 @@ basic type) and applying along columns will also convert to object. .. ipython:: python - df = pd.DataFrame({"a":[1,2,3,4], - "b":["a","b","c","d"], - "cats":pd.Categorical([1,2,3,2])}) + df = pd.DataFrame({"a": [1, 2, 3, 4], + "b": ["a", "b", "c", "d"], + "cats": pd.Categorical([1, 2, 3, 2])}) df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) @@ -1164,10 +1178,10 @@ Setting the index will create a ``CategoricalIndex``: .. ipython:: python - cats = pd.Categorical([1,2,3,4], categories=[4,2,3,1]) - strings = ["a","b","c","d"] - values = [4,2,3,1] - df = pd.DataFrame({"strings":strings, "values":values}, index=cats) + cats = pd.Categorical([1, 2, 3, 4], categories=[4, 2, 3, 1]) + strings = ["a", "b", "c", "d"] + values = [4, 2, 3, 1] + df = pd.DataFrame({"strings": strings, "values": values}, index=cats) df.index # This now sorts by the categories order df.sort_index() @@ -1181,20 +1195,20 @@ change the original ``Categorical``: .. ipython:: python - cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) + cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) s = pd.Series(cat, name="cat") cat s.iloc[0:2] = 10 cat df = pd.DataFrame(s) - df["cat"].cat.categories = [1,2,3,4,5] + df["cat"].cat.categories = [1, 2, 3, 4, 5] cat Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categoricals``: .. 
ipython:: python - cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) + cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) s = pd.Series(cat, name="cat", copy=True) cat s.iloc[0:2] = 10 diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index eecacde8ad14e..704b0c4d80537 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -6,7 +6,7 @@ import pandas as pd import numpy as np - pd.options.display.max_rows=15 + pd.options.display.max_rows = 15 Comparison with R / R libraries ******************************* @@ -165,16 +165,15 @@ function. .. ipython:: python - df = pd.DataFrame({ - 'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9], - 'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, - np.nan] - }) + df = pd.DataFrame( + {'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, + np.nan]}) - g = df.groupby(['by1','by2']) - g[['v1','v2']].mean() + g = df.groupby(['by1', 'by2']) + g[['v1', 'v2']].mean() For more details and examples see :ref:`the groupby documentation `. @@ -195,7 +194,7 @@ The :meth:`~pandas.DataFrame.isin` method is similar to R ``%in%`` operator: .. ipython:: python - s = pd.Series(np.arange(5),dtype=np.float32) + s = pd.Series(np.arange(5), dtype=np.float32) s.isin([2, 4]) The ``match`` function returns a vector of the positions of matches @@ -234,11 +233,11 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: import random import string - baseball = pd.DataFrame({ - 'team': ["team %d" % (x+1) for x in range(5)]*5, - 'player': random.sample(list(string.ascii_lowercase),25), - 'batting avg': np.random.uniform(.200, .400, 25) - }) + baseball = pd.DataFrame( + {'team': ["team %d" % (x + 1) for x in range(5)] * 5, + 'player': random.sample(list(string.ascii_lowercase), 25), + 'batting avg': np.random.uniform(.200, .400, 25)}) + baseball.pivot_table(values='batting avg', columns='team', aggfunc=np.max) For more details and examples see :ref:`the reshaping documentation @@ -341,15 +340,13 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = pd.DataFrame({ - 'x': np.random.uniform(1., 168., 120), - 'y': np.random.uniform(7., 334., 120), - 'z': np.random.uniform(1.7, 20.7, 120), - 'month': [5,6,7,8]*30, - 'week': np.random.randint(1,4, 120) - }) + df = pd.DataFrame({'x': np.random.uniform(1., 168., 120), + 'y': np.random.uniform(7., 334., 120), + 'z': np.random.uniform(1.7, 20.7, 120), + 'month': [5, 6, 7, 8] * 30, + 'week': np.random.randint(1, 4, 120)}) - grouped = df.groupby(['month','week']) + grouped = df.groupby(['month', 'week']) grouped['x'].agg([np.mean, np.std]) @@ -374,8 +371,8 @@ In Python, since ``a`` is a list, you can simply use list comprehension. .. ipython:: python - a = np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4) - pd.DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)]) + a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4) + pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)]) |meltlist|_ ~~~~~~~~~~~~ @@ -393,7 +390,7 @@ In Python, this list would be a list of tuples, so .. 
ipython:: python - a = list(enumerate(list(range(1,5))+[np.NAN])) + a = list(enumerate(list(range(1, 5)) + [np.NAN])) pd.DataFrame(a) For more details and examples see :ref:`the Into to Data Structures @@ -419,12 +416,13 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: .. ipython:: python - cheese = pd.DataFrame({'first' : ['John', 'Mary'], - 'last' : ['Doe', 'Bo'], - 'height' : [5.5, 6.0], - 'weight' : [130, 150]}) + cheese = pd.DataFrame({'first': ['John', 'Mary'], + 'last': ['Doe', 'Bo'], + 'height': [5.5, 6.0], + 'weight': [130, 150]}) + pd.melt(cheese, id_vars=['first', 'last']) - cheese.set_index(['first', 'last']).stack() # alternative way + cheese.set_index(['first', 'last']).stack() # alternative way For more details and examples see :ref:`the reshaping documentation `. @@ -452,16 +450,15 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: .. ipython:: python - df = pd.DataFrame({ - 'x': np.random.uniform(1., 168., 12), - 'y': np.random.uniform(7., 334., 12), - 'z': np.random.uniform(1.7, 20.7, 12), - 'month': [5,6,7]*4, - 'week': [1,2]*6 - }) + df = pd.DataFrame({'x': np.random.uniform(1., 168., 12), + 'y': np.random.uniform(7., 334., 12), + 'z': np.random.uniform(1.7, 20.7, 12), + 'month': [5, 6, 7] * 4, + 'week': [1, 2] * 6}) + mdf = pd.melt(df, id_vars=['month', 'week']) - pd.pivot_table(mdf, values='value', index=['variable','week'], - columns=['month'], aggfunc=np.mean) + pd.pivot_table(mdf, values='value', index=['variable', 'week'], + columns=['month'], aggfunc=np.mean) Similarly for ``dcast`` which uses a data.frame called ``df`` in R to aggregate information based on ``Animal`` and ``FeedType``: @@ -491,13 +488,14 @@ using :meth:`~pandas.pivot_table`: 'Amount': [10, 7, 4, 2, 5, 6, 2], }) - df.pivot_table(values='Amount', index='Animal', columns='FeedType', aggfunc='sum') + df.pivot_table(values='Amount', index='Animal', columns='FeedType', + aggfunc='sum') The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: .. ipython:: python - df.groupby(['Animal','FeedType'])['Amount'].sum() + df.groupby(['Animal', 'FeedType'])['Amount'].sum() For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. @@ -516,8 +514,8 @@ In pandas this is accomplished with ``pd.cut`` and ``astype("category")``: .. ipython:: python - pd.cut(pd.Series([1,2,3,4,5,6]), 3) - pd.Series([1,2,3,2,2,3]).astype("category") + pd.cut(pd.Series([1, 2, 3, 4, 5, 6]), 3) + pd.Series([1, 2, 3, 2, 2, 3]).astype("category") For more details and examples see :ref:`categorical introduction ` and the :ref:`API documentation `. There is also a documentation regarding the diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index 0354ad473544b..c4d121c10538c 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -105,9 +105,7 @@ and the values are the data. .. ipython:: python - df = pd.DataFrame({ - 'x': [1, 3, 5], - 'y': [2, 4, 6]}) + df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) df @@ -131,7 +129,8 @@ The pandas method is :func:`read_csv`, which works similarly. .. ipython:: python - url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv' + url = ('https://raw.github.com/pandas-dev/' + 'pandas/master/pandas/tests/data/tips.csv') tips = pd.read_csv(url) tips.head() @@ -289,17 +288,17 @@ see the :ref:`timeseries documentation` for more details. 
tips['date1_year'] = tips['date1'].dt.year tips['date2_month'] = tips['date2'].dt.month tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = (tips['date2'].dt.to_period('M') - - tips['date1'].dt.to_period('M')) + tips['months_between'] = ( + tips['date2'].dt.to_period('M') - tips['date1'].dt.to_period('M')) - tips[['date1','date2','date1_year','date2_month', - 'date1_next','months_between']].head() + tips[['date1', 'date2', 'date1_year', 'date2_month', + 'date1_next', 'months_between']].head() .. ipython:: python :suppress: - tips = tips.drop(['date1','date2','date1_year', - 'date2_month','date1_next','months_between'], axis=1) + tips = tips.drop(['date1', 'date2', 'date1_year', + 'date2_month', 'date1_next', 'months_between'], axis=1) Selection of Columns ~~~~~~~~~~~~~~~~~~~~ @@ -335,7 +334,7 @@ The same operations are expressed in pandas below. tips.drop('sex', axis=1).head() # rename - tips.rename(columns={'total_bill':'total_bill_2'}).head() + tips.rename(columns={'total_bill': 'total_bill_2'}).head() Sorting by Values @@ -365,8 +364,8 @@ Length ~~~~~~ SAS determines the length of a character string with the -`LENGTHN `__ -and `LENGTHC `__ +`LENGTHN `__ +and `LENGTHC `__ functions. ``LENGTHN`` excludes trailing blanks and ``LENGTHC`` includes trailing blanks. .. code-block:: sas @@ -391,7 +390,7 @@ Find ~~~~ SAS determines the position of a character in a string with the -`FINDW `__ function. +`FINDW `__ function. ``FINDW`` takes the string defined by the first argument and searches for the first position of the substring you supply as the second argument. @@ -417,7 +416,7 @@ Substring ~~~~~~~~~ SAS extracts a substring from a string based on its position with the -`SUBSTR `__ function. +`SUBSTR `__ function. .. code-block:: sas @@ -438,7 +437,7 @@ indexes are zero-based. Scan ~~~~ -The SAS `SCAN `__ +The SAS `SCAN `__ function returns the nth word from a string. The first argument is the string you want to parse and the second argument specifies which word you want to extract. @@ -469,9 +468,9 @@ approaches, but this just shows a simple approach. Upcase, Lowcase, and Propcase ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The SAS `UPCASE `__ -`LOWCASE `__ and -`PROPCASE `__ +The SAS `UPCASE `__ +`LOWCASE `__ and +`PROPCASE `__ functions change the case of the argument. .. code-block:: sas @@ -508,7 +507,7 @@ The following tables will be used in the merge examples 'value': np.random.randn(4)}) df1 df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + 'value': np.random.randn(4)}) df2 In SAS, data must be explicitly sorted before merging. Different @@ -695,7 +694,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex','smoker']).first() + tips.groupby(['sex', 'smoker']).first() Other Considerations @@ -709,7 +708,7 @@ This means that the size of data able to be loaded in pandas is limited by your machine's memory, but also that the operations on that data may be faster. If out of core processing is needed, one possibility is the -`dask.dataframe `_ +`dask.dataframe `_ library (currently in development) which provides a subset of pandas functionality for an on-disk ``DataFrame`` @@ -744,7 +743,7 @@ XPORT is a relatively limited format and the parsing of it is not as optimized as some of the other pandas readers. An alternative way to interop data between SAS and pandas is to serialize to csv. -.. code-block:: python +.. 
code-block:: ipython # version 0.17, 10M rows diff --git a/doc/source/comparison_with_sql.rst b/doc/source/comparison_with_sql.rst index ba069b5a44c72..021f37eb5c66f 100644 --- a/doc/source/comparison_with_sql.rst +++ b/doc/source/comparison_with_sql.rst @@ -4,7 +4,7 @@ Comparison with SQL ******************** Since many potential pandas users have some familiarity with -`SQL `_, this page is meant to provide some examples of how +`SQL `_, this page is meant to provide some examples of how various SQL operations would be performed using pandas. If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>` @@ -23,7 +23,8 @@ structure. .. ipython:: python - url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv' + url = ('https://raw.github.com/pandas-dev' + '/pandas/master/pandas/tests/data/tips.csv') tips = pd.read_csv(url) tips.head() @@ -59,7 +60,7 @@ Filtering in SQL is done via a WHERE clause. LIMIT 5; DataFrames can be filtered in multiple ways; the most intuitive of which is using -`boolean indexing `_. +`boolean indexing `_. .. ipython:: python @@ -387,7 +388,7 @@ Top N rows with offset .. ipython:: python - tips.nlargest(10+5, columns='tip').tail(10) + tips.nlargest(10 + 5, columns='tip').tail(10) Top N rows per group ~~~~~~~~~~~~~~~~~~~~ @@ -411,8 +412,7 @@ Top N rows per group .groupby(['day']) .cumcount() + 1) .query('rn < 3') - .sort_values(['day','rn']) - ) + .sort_values(['day', 'rn'])) the same using `rank(method='first')` function @@ -421,8 +421,7 @@ the same using `rank(method='first')` function (tips.assign(rnk=tips.groupby(['day'])['total_bill'] .rank(method='first', ascending=False)) .query('rnk < 3') - .sort_values(['day','rnk']) - ) + .sort_values(['day', 'rnk'])) .. code-block:: sql @@ -445,11 +444,10 @@ Notice that when using ``rank(method='min')`` function .. ipython:: python (tips[tips['tip'] < 2] - .assign(rnk_min=tips.groupby(['sex'])['tip'] - .rank(method='min')) - .query('rnk_min < 3') - .sort_values(['sex','rnk_min']) - ) + .assign(rnk_min=tips.groupby(['sex'])['tip'] + .rank(method='min')) + .query('rnk_min < 3') + .sort_values(['sex', 'rnk_min'])) UPDATE diff --git a/doc/source/comparison_with_stata.rst b/doc/source/comparison_with_stata.rst index 6c518983d5904..e039843b22065 100644 --- a/doc/source/comparison_with_stata.rst +++ b/doc/source/comparison_with_stata.rst @@ -102,9 +102,7 @@ and the values are the data. .. ipython:: python - df = pd.DataFrame({ - 'x': [1, 3, 5], - 'y': [2, 4, 6]}) + df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) df @@ -128,7 +126,8 @@ the data set if presented with a url. .. ipython:: python - url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv' + url = ('https://raw.github.com/pandas-dev' + '/pandas/master/pandas/tests/data/tips.csv') tips = pd.read_csv(url) tips.head() @@ -278,17 +277,17 @@ see the :ref:`timeseries documentation` for more details. tips['date1_year'] = tips['date1'].dt.year tips['date2_month'] = tips['date2'].dt.month tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = (tips['date2'].dt.to_period('M') - - tips['date1'].dt.to_period('M')) + tips['months_between'] = (tips['date2'].dt.to_period('M') + - tips['date1'].dt.to_period('M')) - tips[['date1','date2','date1_year','date2_month', - 'date1_next','months_between']].head() + tips[['date1', 'date2', 'date1_year', 'date2_month', 'date1_next', + 'months_between']].head() .. 
ipython:: python :suppress: - tips = tips.drop(['date1','date2','date1_year', - 'date2_month','date1_next','months_between'], axis=1) + tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month', + 'date1_next', 'months_between'], axis=1) Selection of Columns ~~~~~~~~~~~~~~~~~~~~ @@ -472,7 +471,7 @@ The following tables will be used in the merge examples 'value': np.random.randn(4)}) df1 df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + 'value': np.random.randn(4)}) df2 In Stata, to perform a merge, one data set must be in memory @@ -661,7 +660,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex','smoker']).first() + tips.groupby(['sex', 'smoker']).first() Other Considerations diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 0d2021de8f88e..251dce5141ea5 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -4,14 +4,15 @@ :suppress: import numpy as np + import matplotlib.pyplot as plt + + import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - import pandas as pd - import matplotlib - # matplotlib.style.use('default') - import matplotlib.pyplot as plt + pd.options.display.max_rows = 15 + plt.close('all') - pd.options.display.max_rows=15 .. _computation: @@ -75,7 +76,8 @@ series in the DataFrame, also excluding NA/null values. .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), + columns=['a', 'b', 'c', 'd', 'e']) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that @@ -127,7 +129,8 @@ Wikipedia has articles covering the above correlation coefficients: .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), + columns=['a', 'b', 'c', 'd', 'e']) frame.iloc[::2] = np.nan # Series with Series @@ -163,9 +166,10 @@ compute the correlation based on histogram intersection: .. ipython:: python # histogram intersection - histogram_intersection = lambda a, b: np.minimum( - np.true_divide(a, a.sum()), np.true_divide(b, b.sum()) - ).sum() + def histogram_intersection(a, b): + return np.minimum(np.true_divide(a, a.sum()), + np.true_divide(b, b.sum())).sum() + frame.corr(method=histogram_intersection) A related method :meth:`~DataFrame.corrwith` is implemented on DataFrame to @@ -192,7 +196,7 @@ assigned the mean of the ranks (by default) for the group: .. ipython:: python s = pd.Series(np.random.np.random.randn(5), index=list('abcde')) - s['d'] = s['b'] # so there's a tie + s['d'] = s['b'] # so there's a tie s.rank() :meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows @@ -202,7 +206,7 @@ ranking. .. ipython:: python df = pd.DataFrame(np.random.np.random.randn(10, 6)) - df[4] = df[2][:5] # some ties + df[4] = df[2][:5] # some ties df df.rank(1) @@ -243,7 +247,8 @@ objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expan .. ipython:: python - s = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + s = pd.Series(np.random.randn(1000), + index=pd.date_range('1/1/2000', periods=1000)) s = s.cumsum() s @@ -258,7 +263,7 @@ These object provide tab-completion of the available methods and properties. .. code-block:: ipython - In [14]: r. + In [14]: r. 
# noqa: E225, E999 r.agg r.apply r.count r.exclusions r.max r.median r.name r.skew r.sum r.aggregate r.corr r.cov r.kurt r.mean r.min r.quantile r.std r.var @@ -336,7 +341,9 @@ compute the mean absolute deviation on a rolling basis: .. ipython:: python - mad = lambda x: np.fabs(x - x.mean()).mean() + def mad(x): + return np.fabs(x - x.mean()).mean() + @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') @@ -376,7 +383,8 @@ The list of recognized types are the `scipy.signal window functions .. ipython:: python - ser = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2000', periods=10)) + ser = pd.Series(np.random.randn(10), + index=pd.date_range('1/1/2000', periods=10)) ser.rolling(window=5, win_type='triang').mean() @@ -423,7 +431,9 @@ This can be particularly useful for a non-regular time frequency index. .. ipython:: python dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + index=pd.date_range('20130101 09:00:00', + periods=5, + freq='s')) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. @@ -445,12 +455,12 @@ Using a non-regular, but still monotonic index, rolling with an integer window d .. ipython:: python dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index = pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + index=pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) dft dft.rolling(2).sum() @@ -496,11 +506,11 @@ from present information back to past information. This allows the rolling windo .. ipython:: python df = pd.DataFrame({'x': 1}, - index = [pd.Timestamp('20130101 09:00:01'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:04'), - pd.Timestamp('20130101 09:00:06')]) + index=[pd.Timestamp('20130101 09:00:01'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:04'), + pd.Timestamp('20130101 09:00:06')]) df["right"] = df.rolling('2s', closed='right').x.sum() # default df["both"] = df.rolling('2s', closed='both').x.sum() @@ -601,7 +611,8 @@ can even be omitted: .. ipython:: python - covs = df[['B','C','D']].rolling(window=50).cov(df[['A','B','C']], pairwise=True) + covs = (df[['B', 'C', 'D']].rolling(window=50) + .cov(df[['A', 'B', 'C']], pairwise=True)) covs.loc['2002-09-22':] .. ipython:: python @@ -637,7 +648,7 @@ perform multiple computations on the data. These operations are similar to the : dfa = pd.DataFrame(np.random.randn(1000, 3), index=pd.date_range('1/1/2000', periods=1000), columns=['A', 'B', 'C']) - r = dfa.rolling(window=60,min_periods=1) + r = dfa.rolling(window=60, min_periods=1) r We can aggregate by passing a function to the entire DataFrame, or select a @@ -649,7 +660,7 @@ Series (or multiple Series) via standard ``__getitem__``. r['A'].aggregate(np.sum) - r[['A','B']].aggregate(np.sum) + r[['A', 'B']].aggregate(np.sum) As you can see, the result of the aggregation will have the selected columns, or all columns if none are selected. @@ -683,24 +694,21 @@ By passing a dict to ``aggregate`` you can apply a different aggregation to the columns of a ``DataFrame``: .. 
ipython:: python - :okexcept: - :okwarning: - r.agg({'A' : np.sum, - 'B' : lambda x: np.std(x, ddof=1)}) + r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the windowed object .. ipython:: python - r.agg({'A' : 'sum', 'B' : 'std'}) + r.agg({'A': 'sum', 'B': 'std'}) Furthermore you can pass a nested dict to indicate different aggregations on different columns. .. ipython:: python - r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) + r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) .. _stats.moments.expanding: diff --git a/doc/source/conf.py b/doc/source/conf.py index 29f947e1144ea..d88b5e9757423 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -40,7 +40,6 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # sys.path.append(os.path.abspath('.')) sys.path.insert(0, os.path.abspath('../sphinxext')) - sys.path.extend([ # numpy standard doc extensions @@ -75,6 +74,7 @@ 'sphinx.ext.ifconfig', 'sphinx.ext.linkcode', 'nbsphinx', + 'contributors', # custom pandas extension ] try: @@ -99,7 +99,7 @@ # JP: added from sphinxdocs autosummary_generate = False -if any(re.match("\s*api\s*", l) for l in index_rst_lines): +if any(re.match(r"\s*api\s*", l) for l in index_rst_lines): autosummary_generate = True # numpydoc @@ -120,7 +120,9 @@ templates_path = ['../_templates'] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = [ + '.rst', +] # The encoding of source files. source_encoding = 'utf-8' @@ -298,8 +300,26 @@ for page in moved_api_pages } + +common_imports = """\ +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + from pandas import * + import pandas as pd + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + options.display.max_rows = 15 + from pandas.compat import StringIO +""" + + html_context = { - 'redirects': {old: new for old, new in moved_api_pages} + 'redirects': {old: new for old, new in moved_api_pages}, + 'common_imports': common_imports, } # If false, no module index is generated. @@ -341,8 +361,8 @@ # file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'pandas.tex', - u'pandas: powerful Python data analysis toolkit', - u'Wes McKinney\n\& PyData Development Team', 'manual'), + 'pandas: powerful Python data analysis toolkit', + r'Wes McKinney\n\& PyData Development Team', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -388,6 +408,7 @@ category=FutureWarning) +ipython_warning_is_error = False ipython_exec_lines = [ 'import numpy as np', 'import pandas as pd', @@ -565,19 +586,23 @@ def linkcode_resolve(domain, info): for part in fullname.split('.'): try: obj = getattr(obj, part) - except: + except AttributeError: return None try: - fn = inspect.getsourcefile(obj) - except: + # inspect.unwrap() was added in Python version 3.4 + if sys.version_info >= (3, 5): + fn = inspect.getsourcefile(inspect.unwrap(obj)) + else: + fn = inspect.getsourcefile(obj) + except TypeError: fn = None if not fn: return None try: source, lineno = inspect.getsourcelines(obj) - except: + except OSError: lineno = None if lineno: @@ -649,7 +674,23 @@ def process_class_docstrings(app, what, name, obj, options, lines): ] +def rstjinja(app, docname, source): + """ + Render our pages as a jinja template for fancy templating goodness. 
+ """ + # http://ericholscher.com/blog/2016/jul/25/integrating-jinja-rst-sphinx/ + # Make sure we're outputting HTML + if app.builder.format != 'html': + return + src = source[0] + rendered = app.builder.templates.render_string( + src, app.config.html_context + ) + source[0] = rendered + + def setup(app): + app.connect("source-read", rstjinja) app.connect("autodoc-process-docstring", remove_flags_docstring) app.connect("autodoc-process-docstring", process_class_docstrings) app.add_autodocumenter(AccessorDocumenter) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 445f9a7e5e980..c55452cf27309 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -170,7 +170,7 @@ We'll now kick off a three-step process: .. code-block:: none # Create and activate the build environment - conda env create -f ci/environment-dev.yaml + conda env create -f environment.yml conda activate pandas-dev # or with older versions of Anaconda: @@ -180,9 +180,6 @@ We'll now kick off a three-step process: python setup.py build_ext --inplace -j 4 python -m pip install -e . - # Install the rest of the optional dependencies - conda install -c defaults -c conda-forge --file=ci/requirements-optional-conda.txt - At this point you should be able to import pandas from your locally built version:: $ python # start an interpreter @@ -221,14 +218,12 @@ You'll need to have at least python3.5 installed on your system. . ~/virtualenvs/pandas-dev/bin/activate # Install the build dependencies - python -m pip install -r ci/requirements_dev.txt + python -m pip install -r requirements-dev.txt + # Build and install pandas python setup.py build_ext --inplace -j 4 python -m pip install -e . - # Install additional dependencies - python -m pip install -r ci/requirements-optional-pip.txt - Creating a branch ----------------- @@ -497,6 +492,17 @@ tools will be run to check your code for stylistic errors. Generating any warnings will cause the test to fail. Thus, good style is a requirement for submitting code to *pandas*. +There is a tool in pandas to help contributors verify their changes before +contributing them to the project:: + + ./ci/code_checks.sh + +The script verify the linting of code files, it looks for common mistake patterns +(like missing spaces around sphinx directives that make the documentation not +being rendered properly) and it also validates the doctests. It is possible to +run the checks independently by using the parameters ``lint``, ``patterns`` and +``doctests`` (e.g. ``./ci/code_checks.sh lint``). + In addition, because a lot of people use our library, it is important that we do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as *backwards compatible* @@ -569,7 +575,7 @@ the `flake8 `_ tool and report any stylistic errors in your code. Therefore, it is helpful before submitting code to run the check yourself on the diff:: - git diff master -u -- "*.py" | flake8 --diff + git diff upstream/master -u -- "*.py" | flake8 --diff This command will catch any stylistic errors in your changes specifically, but be beware it may not catch all of them. For example, if you delete the only @@ -578,28 +584,69 @@ unused function. However, style-checking the diff will not catch this because the actual import is not part of the diff. 
Thus, for completeness, you should run this command, though it will take longer:: - git diff master --name-only -- "*.py" | grep "pandas/" | xargs -r flake8 + git diff upstream/master --name-only -- "*.py" | xargs -r flake8 Note that on OSX, the ``-r`` flag is not available, so you have to omit it and run this slightly modified command:: - git diff master --name-only -- "*.py" | grep "pandas/" | xargs flake8 + git diff upstream/master --name-only -- "*.py" | xargs flake8 + +Windows does not support the ``xargs`` command (unless installed for example +via the `MinGW `__ toolchain), but one can imitate the +behaviour as follows:: + + for /f %i in ('git diff upstream/master --name-only -- "*.py"') do flake8 %i + +This will get all the files being changed by the PR (and ending with ``.py``), +and run ``flake8`` on them, one after the other. + +.. _contributing.import-formatting: + +Import Formatting +~~~~~~~~~~~~~~~~~ +*pandas* uses `isort `__ to standardise import +formatting across the codebase. + +A guide to import layout as per pep8 can be found `here `__. + +A summary of our current import sections ( in order ): + +* Future +* Python Standard Library +* Third Party +* ``pandas._libs``, ``pandas.compat``, ``pandas.util._*``, ``pandas.errors`` (largely not dependent on ``pandas.core``) +* ``pandas.core.dtypes`` (largely not dependent on the rest of ``pandas.core``) +* Rest of ``pandas.core.*`` +* Non-core ``pandas.io``, ``pandas.plotting``, ``pandas.tseries`` +* Local application/library specific imports + +Imports are alphabetically sorted within these sections. + + +As part of :ref:`Continuous Integration ` checks we run:: -Note that on Windows, these commands are unfortunately not possible because -commands like ``grep`` and ``xargs`` are not available natively. To imitate the -behavior with the commands above, you should run:: + isort --recursive --check-only pandas - git diff master --name-only -- "*.py" +to check that imports are correctly formatted as per the `setup.cfg`. -This will list all of the Python files that have been modified. The only ones -that matter during linting are any whose directory filepath begins with "pandas." -For each filepath, copy and paste it after the ``flake8`` command as shown below: +If you see output like the below in :ref:`Continuous Integration ` checks: - flake8 +.. code-block:: shell + + Check import format using isort + ERROR: /home/travis/build/pandas-dev/pandas/pandas/io/pytables.py Imports are incorrectly sorted + Check import format using isort DONE + The command "ci/code_checks.sh" exited with 1 + +You should run:: + + isort pandas/io/pytables.py + +to automatically format imports correctly. This will modify your local copy of the files. -Alternatively, you can install the ``grep`` and ``xargs`` commands via the -`MinGW `__ toolchain, and it will allow you to run the -commands above. +The `--recursive` flag can be passed to sort all files in a directory. + +You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `. Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ @@ -623,6 +670,8 @@ Otherwise, you need to do it manually: .. code-block:: python + import warnings + def old_func(): """Summary of the function. @@ -632,6 +681,9 @@ Otherwise, you need to do it manually: warnings.warn('Use new_func instead.', FutureWarning, stacklevel=2) new_func() + def new_func(): + pass + You'll also need to 1. 
write a new test that asserts a warning is issued when calling with the deprecated argument @@ -646,12 +698,12 @@ Testing With Continuous Integration ----------------------------------- The *pandas* test suite will run automatically on `Travis-CI `__, -`Appveyor `__, and `Circle CI `__ continuous integration -services, once your pull request is submitted. +`Azure Pipelines `__, +and `Circle CI `__ continuous integration services, once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, then the continuous integration services need to be hooked to your GitHub repository. Instructions are here for `Travis-CI `__, -`Appveyor `__ , and `CircleCI `__. +`Azure Pipelines `__, and `CircleCI `__. A pull-request will be considered for merging when you have an all 'green' build. If any tests are failing, then you will get a red 'X', where you can click through to see the individual failed tests. @@ -661,8 +713,8 @@ This is an example of a green build. .. note:: - Each time you push to *your* fork, a *new* run of the tests will be triggered on the CI. Appveyor will auto-cancel - any non-currently-running tests for that same pull-request. You can enable the auto-cancel feature for + Each time you push to *your* fork, a *new* run of the tests will be triggered on the CI. + You can enable the auto-cancel feature, which removes any non-currently-running tests for that same pull-request, for `Travis-CI here `__ and for `CircleCI here `__. @@ -673,7 +725,7 @@ Test-driven development/code writing ------------------------------------ *pandas* is serious about testing and strongly encourages contributors to embrace -`test-driven development (TDD) `_. +`test-driven development (TDD) `_. This development process "relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired improvement or new function, then produces the minimum amount of code to pass that test." @@ -733,7 +785,7 @@ Transitioning to ``pytest`` .. code-block:: python class TestReallyCoolFeature(object): - .... + pass Going forward, we are moving to a more *functional* style using the `pytest `__ framework, which offers a richer testing framework that will facilitate testing and developing. Thus, instead of writing test classes, we will write test functions like this: @@ -741,7 +793,7 @@ framework that will facilitate testing and developing. Thus, instead of writing .. code-block:: python def test_really_cool_feature(): - .... 
+ pass Using ``pytest`` ~~~~~~~~~~~~~~~~ @@ -766,25 +818,30 @@ We would name this file ``test_cool_feature.py`` and put in an appropriate place import pandas as pd from pandas.util import testing as tm + @pytest.mark.parametrize('dtype', ['int8', 'int16', 'int32', 'int64']) def test_dtypes(dtype): assert str(np.dtype(dtype)) == dtype - @pytest.mark.parametrize('dtype', ['float32', - pytest.param('int16', marks=pytest.mark.skip), - pytest.param('int32', - marks=pytest.mark.xfail(reason='to show how it works'))]) + + @pytest.mark.parametrize( + 'dtype', ['float32', pytest.param('int16', marks=pytest.mark.skip), + pytest.param('int32', marks=pytest.mark.xfail( + reason='to show how it works'))]) def test_mark(dtype): assert str(np.dtype(dtype)) == 'float32' + @pytest.fixture def series(): return pd.Series([1, 2, 3]) + @pytest.fixture(params=['int8', 'int16', 'int32', 'int64']) def dtype(request): return request.param + def test_series(series, dtype): result = series.astype(dtype) assert result.dtype == dtype @@ -853,6 +910,7 @@ for details `_. st.lists(any_json_value), st.dictionaries(st.text(), any_json_value) )) + @given(value=any_json_value) def test_json_roundtrip(value): result = json.loads(json.dumps(value)) @@ -880,6 +938,8 @@ If your change involves checking that a warning is actually emitted, use .. code-block:: python + df = pd.DataFrame() + with tm.assert_produces_warning(FutureWarning): df.some_operation() @@ -910,7 +970,7 @@ a single test. .. code-block:: python - with warch.catch_warnings(): + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) # Or use warnings.filterwarnings(...) @@ -1043,7 +1103,7 @@ Information on how to write a benchmark and how to use asv can be found in the Documenting your code --------------------- -Changes should be reflected in the release notes located in ``doc/source/whatsnew/vx.y.z.txt``. +Changes should be reflected in the release notes located in ``doc/source/whatsnew/vx.y.z.rst``. This file contains an ongoing change log for each release. Add an entry to this file to document your fix, enhancement or (unavoidable) breaking change. Make sure to include the GitHub issue number when adding your entry (using ``:issue:`1234``` where ``1234`` is the @@ -1067,6 +1127,8 @@ or a new keyword argument (`example `_ serves +automatically from docstrings. `Sphinx `_ serves this purpose. Next example gives an idea on how a docstring looks like: @@ -68,7 +68,7 @@ As PEP-257 is quite open, and some other standards exist on top of it. In the case of pandas, the numpy docstring convention is followed. The conventions is explained in this document: -* `numpydoc docstring guide `_ +* `numpydoc docstring guide `_ (which is based in the original `Guide to NumPy/SciPy documentation `_) @@ -78,7 +78,7 @@ The standard uses reStructuredText (reST). reStructuredText is a markup language that allows encoding styles in plain text files. Documentation about reStructuredText can be found in: -* `Sphinx reStructuredText primer `_ +* `Sphinx reStructuredText primer `_ * `Quick reStructuredText reference `_ * `Full reStructuredText specification `_ @@ -119,7 +119,7 @@ backticks. It is considered inline code: function, prefix it with ``~``. For example, ``:class:`~pandas.Series``` will link to ``pandas.Series`` but only display the last part, ``Series`` as the link text. See `Sphinx cross-referencing syntax - `_ + `_ for details. **Good:** @@ -197,6 +197,8 @@ infinitive verb. """ pass +.. 
code-block:: python + def astype(dtype): """ Method to cast Series type. @@ -205,6 +207,8 @@ infinitive verb. """ pass +.. code-block:: python + def astype(dtype): """ Cast Series type @@ -213,6 +217,8 @@ infinitive verb. """ pass +.. code-block:: python + def astype(dtype): """ Cast Series type from its current type to the new type defined in @@ -624,6 +630,7 @@ A simple example could be: .. code-block:: python class Series: + def head(self, n=5): """ Return the first elements of the Series. @@ -681,12 +688,11 @@ shown: .. code-block:: python - import numpy as np - import pandas as pd - + import numpy as np # noqa: F401 + import pandas as pd # noqa: F401 Any other module used in the examples must be explicitly imported, one per line (as -recommended in `PEP-8 `_) +recommended in :pep:`8#imports`) and avoiding aliases. Avoid excessive imports, but if needed, imports from the standard library go first, followed by third-party libraries (like matplotlib). @@ -720,6 +726,7 @@ positional arguments ``head(3)``. .. code-block:: python class Series: + def mean(self): """ Compute the mean of the input. @@ -946,12 +953,14 @@ substitute the children's class names in this docstring. """Apply my function to %(klass)s.""" ... + class ChildA(Parent): @Substitution(klass="ChildA") @Appender(Parent.my_function.__doc__) def my_function(self): ... + class ChildB(Parent): @Substitution(klass="ChildB") @Appender(Parent.my_function.__doc__) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 21c8ab4128188..16d756acaca51 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -5,24 +5,20 @@ .. ipython:: python :suppress: - import pandas as pd + import datetime + import functools + import glob + import itertools + import os + import numpy as np + import pandas as pd from pandas.compat import StringIO - import random - import os - import itertools - import functools - import datetime np.random.seed(123456) - - pd.options.display.max_rows=15 - - import matplotlib - # matplotlib.style.use('default') - np.set_printoptions(precision=4, suppress=True) + pd.options.display.max_rows = 15 ******** @@ -52,12 +48,14 @@ Idioms These are some neat pandas ``idioms`` `if-then/if-then-else on one column, and assignment to another one or more columns: -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df if-then... ********** @@ -66,98 +64,113 @@ An if-then on one column .. ipython:: python - df.loc[df.AAA >= 5,'BBB'] = -1; df + df.loc[df.AAA >= 5, 'BBB'] = -1 + df An if-then with assignment to 2 columns: .. ipython:: python - df.loc[df.AAA >= 5,['BBB','CCC']] = 555; df + df.loc[df.AAA >= 5, ['BBB', 'CCC']] = 555 + df Add another line with different logic, to do the -else .. ipython:: python - df.loc[df.AAA < 5,['BBB','CCC']] = 2000; df + df.loc[df.AAA < 5, ['BBB', 'CCC']] = 2000 + df Or use pandas where after you've set up a mask .. ipython:: python - df_mask = pd.DataFrame({'AAA' : [True] * 4, 'BBB' : [False] * 4,'CCC' : [True,False] * 2}) - df.where(df_mask,-1000) + df_mask = pd.DataFrame({'AAA': [True] * 4, + 'BBB': [False] * 4, + 'CCC': [True, False] * 2}) + df.where(df_mask, -1000) `if-then-else using numpy's where() -`__ +`__ .. 
ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df - - df['logic'] = np.where(df['AAA'] > 5,'high','low'); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df + df['logic'] = np.where(df['AAA'] > 5, 'high', 'low') + df Splitting ********* `Split a frame with a boolean criterion -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - dflow = df[df.AAA <= 5]; dflow - dfhigh = df[df.AAA > 5]; dfhigh + df[df.AAA <= 5] + df[df.AAA > 5] Building Criteria ***************** `Select with multi-column criteria -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df ...and (without assignment returns a Series) .. ipython:: python - newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA']; newseries + df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA'] ...or (without assignment returns a Series) .. ipython:: python - newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']; newseries + df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA'] ...or (with assignment modifies the DataFrame.) .. ipython:: python - df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1; df + df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1 + df `Select rows with data closest to certain value using argsort -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df - + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df aValue = 43.0 - df.loc[(df.CCC-aValue).abs().argsort()] + df.loc[(df.CCC - aValue).abs().argsort()] `Dynamically reduce a list of criteria using a binary operators -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df Crit1 = df.AAA <= 5.5 Crit2 = df.BBB == 10.0 @@ -173,8 +186,8 @@ One could hard code: .. ipython:: python - CritList = [Crit1,Crit2,Crit3] - AllCrit = functools.reduce(lambda x,y: x & y, CritList) + CritList = [Crit1, Crit2, Crit3] + AllCrit = functools.reduce(lambda x, y: x & y, CritList) df[AllCrit] @@ -189,22 +202,27 @@ DataFrames The :ref:`indexing ` docs. `Using both row labels and value conditionals -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - df[(df.AAA <= 6) & (df.index.isin([0,2,4]))] + df[(df.AAA <= 6) & (df.index.isin([0, 2, 4]))] `Use loc for label-oriented slicing and iloc positional slicing `__ .. 
ipython:: python - data = {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]} - df = pd.DataFrame(data=data,index=['foo','bar','boo','kar']); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}, + index=['foo', 'bar', 'boo', 'kar']) + There are 2 explicit slicing methods, with a third general case @@ -213,9 +231,9 @@ There are 2 explicit slicing methods, with a third general case 3. General (Either slicing style : depends on if the slice contains labels or positions) .. ipython:: python - df.iloc[0:3] #Positional + df.iloc[0:3] # Positional - df.loc['bar':'kar'] #Label + df.loc['bar':'kar'] # Label # Generic df.iloc[0:3] @@ -225,21 +243,24 @@ Ambiguity arises when an index consists of integers with a non-zero start or non .. ipython:: python - df2 = pd.DataFrame(data=data,index=[1,2,3,4]); #Note index starts at 1. - - df2.iloc[1:3] #Position-oriented - - df2.loc[1:3] #Label-oriented + data = {'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]} + df2 = pd.DataFrame(data=data, index=[1, 2, 3, 4]) # Note index starts at 1. + df2.iloc[1:3] # Position-oriented + df2.loc[1:3] # Label-oriented `Using inverse operator (~) to take the complement of a mask -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40], 'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - df[~((df.AAA <= 6) & (df.index.isin([0,2,4])))] + df[~((df.AAA <= 6) & (df.index.isin([0, 2, 4])))] Panels ****** @@ -249,42 +270,50 @@ Panels .. ipython:: python - rng = pd.date_range('1/1/2013',periods=100,freq='D') + rng = pd.date_range('1/1/2013', periods=100, freq='D') data = np.random.randn(100, 4) - cols = ['A','B','C','D'] - df1, df2, df3 = pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols) + cols = ['A', 'B', 'C', 'D'] + df1 = pd.DataFrame(data, rng, cols) + df2 = pd.DataFrame(data, rng, cols) + df3 = pd.DataFrame(data, rng, cols) - pf = pd.Panel({'df1':df1,'df2':df2,'df3':df3});pf + pf = pd.Panel({'df1': df1, 'df2': df2, 'df3': df3}) + pf - pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols);pf + pf.loc[:, :, 'F'] = pd.DataFrame(data, rng, cols) + pf `Mask a panel by using np.where and then reconstructing the panel with the new masked values -`__ +`__ New Columns *********** `Efficiently and dynamically creating new columns using applymap -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [1,2,1,3], 'BBB' : [1,1,2,2], 'CCC' : [2,1,3,1]}); df + df = pd.DataFrame({'AAA': [1, 2, 1, 3], + 'BBB': [1, 1, 2, 2], + 'CCC': [2, 1, 3, 1]}) + df - source_cols = df.columns # or some subset would work too. + source_cols = df.columns # Or some subset would work too new_cols = [str(x) + "_cat" for x in source_cols] - categories = {1 : 'Alpha', 2 : 'Beta', 3 : 'Charlie' } + categories = {1: 'Alpha', 2: 'Beta', 3: 'Charlie'} - df[new_cols] = df[source_cols].applymap(categories.get);df + df[new_cols] = df[source_cols].applymap(categories.get) + df `Keep other columns when using min() with groupby -`__ +`__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [1,1,1,2,2,2,3,3], 'BBB' : [2,1,3,4,5,1,2,3]}); df + df = pd.DataFrame({'AAA': [1, 1, 1, 2, 2, 2, 3, 3], + 'BBB': [2, 1, 3, 4, 5, 1, 2, 3]}) + df Method 1 : idxmin() to get the index of the minimums @@ -308,92 +337,102 @@ MultiIndexing The :ref:`multindexing ` docs. `Creating a MultiIndex from a labeled frame -`__ +`__ .. 
ipython:: python - df = pd.DataFrame({'row' : [0,1,2], - 'One_X' : [1.1,1.1,1.1], - 'One_Y' : [1.2,1.2,1.2], - 'Two_X' : [1.11,1.11,1.11], - 'Two_Y' : [1.22,1.22,1.22]}); df + df = pd.DataFrame({'row': [0, 1, 2], + 'One_X': [1.1, 1.1, 1.1], + 'One_Y': [1.2, 1.2, 1.2], + 'Two_X': [1.11, 1.11, 1.11], + 'Two_Y': [1.22, 1.22, 1.22]}) + df # As Labelled Index - df = df.set_index('row');df + df = df.set_index('row') + df # With Hierarchical Columns - df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in df.columns]);df + df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) + for c in df.columns]) + df # Now stack & Reset - df = df.stack(0).reset_index(1);df + df = df.stack(0).reset_index(1) + df # And fix the labels (Notice the label 'level_1' got added automatically) - df.columns = ['Sample','All_X','All_Y'];df + df.columns = ['Sample', 'All_X', 'All_Y'] + df Arithmetic ********** `Performing arithmetic with a MultiIndex that needs broadcasting -`__ +`__ .. ipython:: python - cols = pd.MultiIndex.from_tuples([ (x,y) for x in ['A','B','C'] for y in ['O','I']]) - df = pd.DataFrame(np.random.randn(2,6),index=['n','m'],columns=cols); df - df = df.div(df['C'],level=1); df + cols = pd.MultiIndex.from_tuples([(x, y) for x in ['A', 'B', 'C'] + for y in ['O', 'I']]) + df = pd.DataFrame(np.random.randn(2, 6), index=['n', 'm'], columns=cols) + df + df = df.div(df['C'], level=1) + df Slicing ******* `Slicing a MultiIndex with xs -`__ +`__ .. ipython:: python - coords = [('AA','one'),('AA','six'),('BB','one'),('BB','two'),('BB','six')] + coords = [('AA', 'one'), ('AA', 'six'), ('BB', 'one'), ('BB', 'two'), + ('BB', 'six')] index = pd.MultiIndex.from_tuples(coords) - df = pd.DataFrame([11,22,33,44,55],index,['MyData']); df + df = pd.DataFrame([11, 22, 33, 44, 55], index, ['MyData']) + df To take the cross section of the 1st level and 1st axis the index: .. ipython:: python - df.xs('BB',level=0,axis=0) #Note : level and axis are optional, and default to zero + # Note : level and axis are optional, and default to zero + df.xs('BB', level=0, axis=0) ...and now the 2nd level of the 1st axis. .. ipython:: python - df.xs('six',level=1,axis=0) + df.xs('six', level=1, axis=0) `Slicing a MultiIndex with xs, method #2 -`__ +`__ .. 
ipython:: python - index = list(itertools.product(['Ada','Quinn','Violet'],['Comp','Math','Sci'])) - headr = list(itertools.product(['Exams','Labs'],['I','II'])) - - indx = pd.MultiIndex.from_tuples(index,names=['Student','Course']) - cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-named - - data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)] - - df = pd.DataFrame(data,indx,cols); df + index = list(itertools.product(['Ada', 'Quinn', 'Violet'], + ['Comp', 'Math', 'Sci'])) + headr = list(itertools.product(['Exams', 'Labs'], ['I', 'II'])) + indx = pd.MultiIndex.from_tuples(index, names=['Student', 'Course']) + cols = pd.MultiIndex.from_tuples(headr) # Notice these are un-named + data = [[70 + x + y + (x * y) % 3 for x in range(4)] for y in range(9)] + df = pd.DataFrame(data, indx, cols) + df All = slice(None) - df.loc['Violet'] - df.loc[(All,'Math'),All] - df.loc[(slice('Ada','Quinn'),'Math'),All] - df.loc[(All,'Math'),('Exams')] - df.loc[(All,'Math'),(All,'II')] + df.loc[(All, 'Math'), All] + df.loc[(slice('Ada', 'Quinn'), 'Math'), All] + df.loc[(All, 'Math'), ('Exams')] + df.loc[(All, 'Math'), (All, 'II')] `Setting portions of a MultiIndex with xs -`__ +`__ Sorting ******* `Sort by specific column or an ordered list of columns, with a MultiIndex -`__ +`__ .. ipython:: python @@ -422,7 +461,9 @@ Fill forward a reversed timeseries .. ipython:: python - df = pd.DataFrame(np.random.randn(6,1), index=pd.date_range('2013-08-01', periods=6, freq='B'), columns=list('A')) + df = pd.DataFrame(np.random.randn(6, 1), + index=pd.date_range('2013-08-01', periods=6, freq='B'), + columns=list('A')) df.loc[df.index[3], 'A'] = np.nan df df.reindex(df.index[::-1]).ffill() @@ -453,9 +494,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(), 'size': list('SSMMMLL'), 'weight': [8, 10, 11, 1, 20, 12, 12], - 'adult' : [False] * 5 + [True] * 2}); df + 'adult': [False] * 5 + [True] * 2}) + df - #List the size of the animals with the highest weight. + # List the size of the animals with the highest weight. df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) `Using get_group @@ -464,7 +506,6 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python gb = df.groupby(['animal']) - gb.get_group('cat') `Apply to different items in a group @@ -473,14 +514,14 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python def GrowUp(x): - avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) - avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) - avg_weight += sum(x[x['size'] == 'L'].weight) - avg_weight /= len(x) - return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult']) + avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) + avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) + avg_weight += sum(x[x['size'] == 'L'].weight) + avg_weight /= len(x) + return pd.Series(['L', avg_weight, True], + index=['size', 'weight', 'adult']) expected_df = gb.apply(GrowUp) - expected_df `Expanding Apply @@ -488,15 +529,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python - S = pd.Series([i / 100.0 for i in range(1,11)]) + S = pd.Series([i / 100.0 for i in range(1, 11)]) - def CumRet(x,y): - return x * (1 + y) + def cum_ret(x, y): + return x * (1 + y) - def Red(x): - return functools.reduce(CumRet,x,1.0) + def red(x): + return functools.reduce(cum_ret, x, 1.0) - S.expanding().apply(Red, raw=True) + S.expanding().apply(red, raw=True) `Replacing some values with mean of the rest of a group @@ -504,7 +545,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]}) + df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, -1, 1, 2]}) gb = df.groupby('A') def replace(g): @@ -535,15 +576,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - rng = pd.date_range(start="2014-10-07",periods=10,freq='2min') - ts = pd.Series(data = list(range(10)), index = rng) + rng = pd.date_range(start="2014-10-07", periods=10, freq='2min') + ts = pd.Series(data=list(range(10)), index=rng) def MyCust(x): - if len(x) > 2: - return x[1] * 1.234 - return pd.NaT + if len(x) > 2: + return x[1] * 1.234 + return pd.NaT - mhc = {'Mean' : np.mean, 'Max' : np.max, 'Custom' : MyCust} + mhc = {'Mean': np.mean, 'Max': np.max, 'Custom': MyCust} ts.resample("5min").apply(mhc) ts @@ -553,7 +594,8 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python df = pd.DataFrame({'Color': 'Red Red Red Blue'.split(), - 'Value': [100, 150, 50, 50]}); df + 'Value': [100, 150, 50, 50]}) + df df['Counts'] = df.groupby(['Color']).transform(len) df @@ -562,11 +604,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame( - {u'line_race': [10, 10, 8, 10, 10, 8], - u'beyer': [99, 102, 103, 103, 88, 100]}, - index=[u'Last Gunfighter', u'Last Gunfighter', u'Last Gunfighter', - u'Paynter', u'Paynter', u'Paynter']); df + df = pd.DataFrame({'line_race': [10, 10, 8, 10, 10, 8], + 'beyer': [99, 102, 103, 103, 88, 100]}, + index=['Last Gunfighter', 'Last Gunfighter', + 'Last Gunfighter', 'Paynter', 'Paynter', + 'Paynter']) + df df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1) df @@ -575,9 +618,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'host':['other','other','that','this','this'], - 'service':['mail','web','mail','mail','web'], - 'no':[1, 2, 1, 2, 1]}).set_index(['host', 'service']) + df = pd.DataFrame({'host': ['other', 'other', 'that', 'this', 'this'], + 'service': ['mail', 'web', 'mail', 'mail', 'web'], + 'no': [1, 2, 1, 2, 1]}).set_index(['host', 'service']) mask = df.groupby(level=0).agg('idxmax') df_count = df.loc[mask['no']].reset_index() df_count @@ -613,10 +656,12 @@ Create a list of dataframes, split using a delineation based on logic included i .. ipython:: python - df = pd.DataFrame(data={'Case' : ['A','A','A','B','A','A','B','A','A'], - 'Data' : np.random.randn(9)}) + df = pd.DataFrame(data={'Case': ['A', 'A', 'A', 'B', 'A', 'A', 'B', 'A', + 'A'], + 'Data': np.random.randn(9)}) - dfs = list(zip(*df.groupby((1*(df['Case']=='B')).cumsum().rolling(window=3,min_periods=1).median())))[-1] + dfs = list(zip(*df.groupby((1 * (df['Case'] == 'B')).cumsum() + .rolling(window=3, min_periods=1).median())))[-1] dfs[0] dfs[1] @@ -633,10 +678,13 @@ The :ref:`Pivot ` docs. .. 
ipython:: python - df = pd.DataFrame(data={'Province' : ['ON','QC','BC','AL','AL','MN','ON'], - 'City' : ['Toronto','Montreal','Vancouver','Calgary','Edmonton','Winnipeg','Windsor'], - 'Sales' : [13,6,16,8,4,3,1]}) - table = pd.pivot_table(df,values=['Sales'],index=['Province'],columns=['City'],aggfunc=np.sum,margins=True) + df = pd.DataFrame(data={'Province': ['ON', 'QC', 'BC', 'AL', 'AL', 'MN', 'ON'], + 'City': ['Toronto', 'Montreal', 'Vancouver', + 'Calgary', 'Edmonton', 'Winnipeg', + 'Windsor'], + 'Sales': [13, 6, 16, 8, 4, 3, 1]}) + table = pd.pivot_table(df, values=['Sales'], index=['Province'], + columns=['City'], aggfunc=np.sum, margins=True) table.stack('City') `Frequency table like plyr in R @@ -644,20 +692,26 @@ The :ref:`Pivot ` docs. .. ipython:: python - grades = [48,99,75,80,42,80,72,68,36,78] - df = pd.DataFrame( {'ID': ["x%d" % r for r in range(10)], - 'Gender' : ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M'], - 'ExamYear': ['2007','2007','2007','2008','2008','2008','2008','2009','2009','2009'], - 'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra', 'bio', 'bio'], - 'Participated': ['yes','yes','yes','yes','no','yes','yes','yes','yes','yes'], - 'Passed': ['yes' if x > 50 else 'no' for x in grades], - 'Employed': [True,True,True,False,False,False,False,True,True,False], - 'Grade': grades}) + grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78] + df = pd.DataFrame({'ID': ["x%d" % r for r in range(10)], + 'Gender': ['F', 'M', 'F', 'M', 'F', + 'M', 'F', 'M', 'M', 'M'], + 'ExamYear': ['2007', '2007', '2007', '2008', '2008', + '2008', '2008', '2009', '2009', '2009'], + 'Class': ['algebra', 'stats', 'bio', 'algebra', + 'algebra', 'stats', 'stats', 'algebra', + 'bio', 'bio'], + 'Participated': ['yes', 'yes', 'yes', 'yes', 'no', + 'yes', 'yes', 'yes', 'yes', 'yes'], + 'Passed': ['yes' if x > 50 else 'no' for x in grades], + 'Employed': [True, True, True, False, + False, False, False, True, True, False], + 'Grade': grades}) df.groupby('ExamYear').agg({'Participated': lambda x: x.value_counts()['yes'], - 'Passed': lambda x: sum(x == 'yes'), - 'Employed' : lambda x : sum(x), - 'Grade' : lambda x : sum(x) / len(x)}) + 'Passed': lambda x: sum(x == 'yes'), + 'Employed': lambda x: sum(x), + 'Grade': lambda x: sum(x) / len(x)}) `Plot pandas DataFrame with year over year data `__ @@ -680,12 +734,16 @@ Apply .. ipython:: python - df = pd.DataFrame(data={'A' : [[2,4,8,16],[100,200],[10,20,30]], 'B' : [['a','b','c'],['jj','kk'],['ccc']]},index=['I','II','III']) + df = pd.DataFrame(data={'A': [[2, 4, 8, 16], [100, 200], [10, 20, 30]], + 'B': [['a', 'b', 'c'], ['jj', 'kk'], ['ccc']]}, + index=['I', 'II', 'III']) def SeriesFromSubList(aList): - return pd.Series(aList) + return pd.Series(aList) - df_orgz = pd.concat(dict([ (ind,row.apply(SeriesFromSubList)) for ind,row in df.iterrows() ])) + df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) + for ind, row in df.iterrows()}) + df_orgz `Rolling Apply with a DataFrame returning a Series `__ @@ -694,15 +752,18 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc .. 
ipython:: python - df = pd.DataFrame(data=np.random.randn(2000,2)/10000, - index=pd.date_range('2001-01-01',periods=2000), - columns=['A','B']); df + df = pd.DataFrame(data=np.random.randn(2000, 2) / 10000, + index=pd.date_range('2001-01-01', periods=2000), + columns=['A', 'B']) + df - def gm(aDF,Const): - v = ((((aDF.A+aDF.B)+1).cumprod())-1)*Const - return (aDF.index[0],v.iloc[-1]) + def gm(df, const): + v = ((((df.A + df.B) + 1).cumprod()) - 1) * const + return v.iloc[-1] - S = pd.Series(dict([ gm(df.iloc[i:min(i+51,len(df)-1)],5) for i in range(len(df)-50) ])); S + s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) + for i in range(len(df) - 50)}) + s `Rolling apply with a DataFrame returning a Scalar `__ @@ -711,14 +772,20 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight .. ipython:: python - rng = pd.date_range(start = '2014-01-01',periods = 100) - df = pd.DataFrame({'Open' : np.random.randn(len(rng)), - 'Close' : np.random.randn(len(rng)), - 'Volume' : np.random.randint(100,2000,len(rng))}, index=rng); df + rng = pd.date_range(start='2014-01-01', periods=100) + df = pd.DataFrame({'Open': np.random.randn(len(rng)), + 'Close': np.random.randn(len(rng)), + 'Volume': np.random.randint(100, 2000, len(rng))}, + index=rng) + df + + def vwap(bars): + return ((bars.Close * bars.Volume).sum() / bars.Volume.sum()) - def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()) window = 5 - s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(len(df)-window) ]); + s = pd.concat([(pd.Series(vwap(df.iloc[i:i + window]), + index=[df.index[i + window]])) + for i in range(len(df) - window)]) s.round(2) Timeseries @@ -806,21 +873,25 @@ Depending on df construction, ``ignore_index`` may be needed .. ipython:: python - df = df1.append(df2,ignore_index=True); df + df = df1.append(df2, ignore_index=True) + df `Self Join of a DataFrame `__ .. ipython:: python - df = pd.DataFrame(data={'Area' : ['A'] * 5 + ['C'] * 2, - 'Bins' : [110] * 2 + [160] * 3 + [40] * 2, - 'Test_0' : [0, 1, 0, 1, 2, 0, 1], - 'Data' : np.random.randn(7)});df + df = pd.DataFrame(data={'Area': ['A'] * 5 + ['C'] * 2, + 'Bins': [110] * 2 + [160] * 3 + [40] * 2, + 'Test_0': [0, 1, 0, 1, 2, 0, 1], + 'Data': np.random.randn(7)}) + df df['Test_1'] = df['Test_0'] - 1 - pd.merge(df, df, left_on=['Bins', 'Area','Test_0'], right_on=['Bins', 'Area','Test_1'],suffixes=('_L','_R')) + pd.merge(df, df, left_on=['Bins', 'Area', 'Test_0'], + right_on=['Bins', 'Area', 'Test_1'], + suffixes=('_L', '_R')) `How to set the index and join `__ @@ -871,16 +942,16 @@ The :ref:`Plotting ` docs. .. ipython:: python df = pd.DataFrame( - {u'stratifying_var': np.random.uniform(0, 100, 20), - u'price': np.random.normal(100, 5, 20)}) + {'stratifying_var': np.random.uniform(0, 100, 20), + 'price': np.random.normal(100, 5, 20)}) - df[u'quartiles'] = pd.qcut( - df[u'stratifying_var'], + df['quartiles'] = pd.qcut( + df['stratifying_var'], 4, - labels=[u'0-25%', u'25-50%', u'50-75%', u'75-100%']) + labels=['0-25%', '25-50%', '50-75%', '75-100%']) @savefig quartile_boxplot.png - df.boxplot(column=u'price', by=u'quartiles') + df.boxplot(column='price', by='quartiles') Data In/Out ----------- @@ -951,7 +1022,6 @@ You can use the same approach to read all files matching a pattern. Here is an .. 
ipython:: python - import glob files = glob.glob('file_*.csv') result = pd.concat([pd.read_csv(f) for f in files], ignore_index=True) @@ -968,38 +1038,17 @@ Parsing date components in multi-columns Parsing date components in multi-columns is faster with a format -.. code-block:: python - - In [30]: i = pd.date_range('20000101',periods=10000) - - In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day)) - - In [32]: df.head() - Out[32]: - day month year - 0 1 1 2000 - 1 2 1 2000 - 2 3 1 2000 - 3 4 1 2000 - 4 5 1 2000 - - In [33]: %timeit pd.to_datetime(df.year*10000+df.month*100+df.day,format='%Y%m%d') - 100 loops, best of 3: 7.08 ms per loop - - # simulate combinging into a string, then parsing - In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'],x['month'],x['day']),axis=1) +.. ipython:: python - In [35]: ds.head() - Out[35]: - 0 20000101 - 1 20000102 - 2 20000103 - 3 20000104 - 4 20000105 - dtype: object + i = pd.date_range('20000101', periods=10000) + df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) + df.head() + %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d') + ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], + x['month'], x['day']), axis=1) + ds.head() + %timeit pd.to_datetime(ds) - In [36]: %timeit pd.to_datetime(ds) - 1 loops, best of 3: 488 ms per loop Skip row between header and data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1032,8 +1081,8 @@ Option 1: pass rows explicitly to skip rows .. ipython:: python - pd.read_csv(StringIO(data), sep=';', skiprows=[11,12], - index_col=0, parse_dates=True, header=10) + pd.read_csv(StringIO(data), sep=';', skiprows=[11, 12], + index_col=0, parse_dates=True, header=10) Option 2: read column names and then data """"""""""""""""""""""""""""""""""""""""" @@ -1138,12 +1187,12 @@ Storing Attributes to a group node .. ipython:: python - df = pd.DataFrame(np.random.randn(8,3)) + df = pd.DataFrame(np.random.randn(8, 3)) store = pd.HDFStore('test.h5') - store.put('df',df) + store.put('df', df) # you can store an arbitrary Python object via pickle - store.get_storer('df').attrs.my_attribute = dict(A = 10) + store.get_storer('df').attrs.my_attribute = {'A': 10} store.get_storer('df').attrs.my_attribute .. ipython:: python @@ -1226,38 +1275,47 @@ Computation Correlation *********** -The `method` argument within `DataFrame.corr` can accept a callable in addition to the named correlation types. Here we compute the `distance correlation `__ matrix for a `DataFrame` object. +Often it's useful to obtain the lower (or upper) triangular form of a correlation matrix calculated from :func:`DataFrame.corr`. This can be achieved by passing a boolean mask to ``where`` as follows: .. ipython:: python - def distcorr(x, y): - n = len(x) - a = np.zeros(shape=(n, n)) - b = np.zeros(shape=(n, n)) + df = pd.DataFrame(np.random.random(size=(100, 5))) + + corr_mat = df.corr() + mask = np.tril(np.ones_like(corr_mat, dtype=np.bool), k=-1) + + corr_mat.where(mask) - for i in range(n): - for j in range(i + 1, n): - a[i, j] = abs(x[i] - x[j]) - b[i, j] = abs(y[i] - y[j]) +The `method` argument within `DataFrame.corr` can accept a callable in addition to the named correlation types. Here we compute the `distance correlation `__ matrix for a `DataFrame` object. + +.. 
ipython:: python - a += a.T - b += b.T + def distcorr(x, y): + n = len(x) + a = np.zeros(shape=(n, n)) + b = np.zeros(shape=(n, n)) - a_bar = np.vstack([np.nanmean(a, axis=0)] * n) - b_bar = np.vstack([np.nanmean(b, axis=0)] * n) + for i in range(n): + for j in range(i + 1, n): + a[i, j] = abs(x[i] - x[j]) + b[i, j] = abs(y[i] - y[j]) - A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean()) - B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean()) + a += a.T + b += b.T - cov_ab = np.sqrt(np.nansum(A * B)) / n - std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n) - std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n) + a_bar = np.vstack([np.nanmean(a, axis=0)] * n) + b_bar = np.vstack([np.nanmean(b, axis=0)] * n) - return cov_ab / std_a / std_b + A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean()) + B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean()) + cov_ab = np.sqrt(np.nansum(A * B)) / n + std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n) + std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n) - df = pd.DataFrame(np.random.normal(size=(100, 3))) + return cov_ab / std_a / std_b - df.corr(method=distcorr) + df = pd.DataFrame(np.random.normal(size=(100, 3))) + df.corr(method=distcorr) Timedeltas ---------- @@ -1269,17 +1327,17 @@ The :ref:`Timedeltas ` docs. .. ipython:: python - s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) + s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) s - s.max() s.max() - s - s - datetime.datetime(2011,1,1,3,5) + s - datetime.datetime(2011, 1, 1, 3, 5) s + datetime.timedelta(minutes=5) - datetime.datetime(2011,1,1,3,5) - s + datetime.datetime(2011, 1, 1, 3, 5) - s datetime.timedelta(minutes=5) + s @@ -1288,13 +1346,15 @@ The :ref:`Timedeltas ` docs. .. ipython:: python - deltas = pd.Series([ datetime.timedelta(days=i) for i in range(3) ]) + deltas = pd.Series([datetime.timedelta(days=i) for i in range(3)]) - df = pd.DataFrame(dict(A = s, B = deltas)); df + df = pd.DataFrame({'A': s, 'B': deltas}) + df - df['New Dates'] = df['A'] + df['B']; + df['New Dates'] = df['A'] + df['B'] - df['Delta'] = df['A'] - df['New Dates']; df + df['Delta'] = df['A'] - df['New Dates'] + df df.dtypes @@ -1305,9 +1365,11 @@ Values can be set to NaT using np.nan, similar to datetime .. ipython:: python - y = s - s.shift(); y + y = s - s.shift() + y - y[1] = np.nan; y + y[1] = np.nan + y Aliasing Axis Names ------------------- @@ -1317,23 +1379,24 @@ To globally provide aliases for axis names, one can define these 2 functions: .. ipython:: python def set_axis_alias(cls, axis, alias): - if axis not in cls._AXIS_NUMBERS: - raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) - cls._AXIS_ALIASES[alias] = axis + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES[alias] = axis .. ipython:: python def clear_axis_alias(cls, axis, alias): - if axis not in cls._AXIS_NUMBERS: - raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) - cls._AXIS_ALIASES.pop(alias,None) + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES.pop(alias, None) .. 
ipython:: python - set_axis_alias(pd.DataFrame,'columns', 'myaxis2') - df2 = pd.DataFrame(np.random.randn(3,2),columns=['c1','c2'],index=['i1','i2','i3']) + set_axis_alias(pd.DataFrame, 'columns', 'myaxis2') + df2 = pd.DataFrame(np.random.randn(3, 2), columns=['c1', 'c2'], + index=['i1', 'i2', 'i3']) df2.sum(axis='myaxis2') - clear_axis_alias(pd.DataFrame,'columns', 'myaxis2') + clear_axis_alias(pd.DataFrame, 'columns', 'myaxis2') Creating Example Data --------------------- @@ -1346,11 +1409,10 @@ of the data values: def expand_grid(data_dict): - rows = itertools.product(*data_dict.values()) - return pd.DataFrame.from_records(rows, columns=data_dict.keys()) + rows = itertools.product(*data_dict.values()) + return pd.DataFrame.from_records(rows, columns=data_dict.keys()) - df = expand_grid( - {'height': [60, 70], - 'weight': [100, 140, 180], - 'sex': ['Male', 'Female']}) + df = expand_grid({'height': [60, 70], + 'weight': [100, 140, 180], + 'sex': ['Male', 'Female']}) df diff --git a/doc/source/developer.rst b/doc/source/developer.rst index f76af394abc48..6be58f20087b5 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -2,15 +2,6 @@ .. currentmodule:: pandas -.. ipython:: python - :suppress: - - import numpy as np - np.random.seed(123456) - np.set_printoptions(precision=4, suppress=True) - import pandas as pd - pd.options.display.max_rows = 15 - ********* Developer ********* diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index efa52a6f7cfe2..968b30d7e9e2b 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -137,7 +137,43 @@ However, operations such as slicing will also slice the index. s[[4, 3, 1]] np.exp(s) -We will address array-based indexing in a separate :ref:`section `. +.. note:: + + We will address array-based indexing like ``s[[4, 3, 1]]`` + in :ref:`section `. + +Like a NumPy array, a pandas Series has a :attr:`~Series.dtype`. + +.. ipython:: python + + s.dtype + +This is often a NumPy dtype. However, pandas and 3rd-party libraries +extend NumPy's type system in a few places, in which case the dtype would +be a :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within +pandas are :ref:`categorical` and :ref:`integer_na`. See :ref:`basics.dtypes` +for more. + +If you need the actual array backing a ``Series``, use :attr:`Series.array`. + +.. ipython:: python + + s.array + +Again, this is often a NumPy array, but may instead be a +:class:`~pandas.api.extensions.ExtensionArray`. See :ref:`basics.dtypes` for more. +Accessing the array can be useful when you need to do some operation without the +index (to disable :ref:`automatic alignment `, for example). + +While Series is ndarray-like, if you need an *actual* ndarray, then use +:meth:`Series.to_numpy`. + +.. ipython:: python + + s.to_numpy() + +Even if the Series is backed by a :class:`~pandas.api.extensions.ExtensionArray`, +:meth:`Series.to_numpy` will return a NumPy ndarray. Series is dict-like ~~~~~~~~~~~~~~~~~~~ @@ -249,7 +285,7 @@ pandas object. Like Series, DataFrame accepts many different kinds of input: * Dict of 1D ndarrays, lists, dicts, or Series * 2-D numpy.ndarray * `Structured or record - `__ ndarray + `__ ndarray * A ``Series`` * Another ``DataFrame`` @@ -476,7 +512,7 @@ Assigning New Columns in Method Chains ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Inspired by `dplyr's -`__ +`__ ``mutate`` verb, DataFrame has an :meth:`~pandas.DataFrame.assign` method that allows you to easily create new columns that are potentially derived from existing columns. 
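A minimal sketch of the ``assign`` usage described above (the frame and column names here are hypothetical, not taken from the changeset):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'a': [1, 2, 3]})

   # assign returns a *new* DataFrame with the extra columns; values may be
   # callables (evaluated against df) or plain array-likes.
   df.assign(b=lambda x: x['a'] * 2,
             c=df['a'] + 10)

Because ``b`` and ``c`` in this sketch do not depend on each other, the keyword-argument ordering caveat covered in the next hunk does not come into play here.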
@@ -566,13 +602,12 @@ To write code compatible with all versions of Python, split the assignment in tw .. code-block:: python >>> dependent = pd.DataFrame({"A": [1, 1, 1]}) - >>> dependent.assign(A=lambda x: x["A"] + 1, - B=lambda x: x["A"] + 2) + >>> dependent.assign(A=lambda x: x["A"] + 1, B=lambda x: x["A"] + 2) For Python 3.5 and earlier the expression creating ``B`` refers to the "old" value of ``A``, ``[1, 1, 1]``. The output is then - .. code-block:: python + .. code-block:: console A B 0 2 3 @@ -582,7 +617,7 @@ To write code compatible with all versions of Python, split the assignment in tw For Python 3.6 and later, the expression creating ``A`` refers to the "new" value of ``A``, ``[2, 2, 2]``, which results in - .. code-block:: python + .. code-block:: console A B 0 2 4 @@ -618,6 +653,8 @@ slicing, see the :ref:`section on indexing `. We will address the fundamentals of reindexing / conforming to new sets of labels in the :ref:`section on reindexing `. +.. _dsintro.alignment: + Data alignment and arithmetic ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -815,7 +852,7 @@ accessed like an attribute: df df.foo1 -The columns are also connected to the `IPython `__ +The columns are also connected to the `IPython `__ completion mechanism so they can be tab-completed: .. code-block:: ipython @@ -834,7 +871,7 @@ Panel a future version. See the section :ref:`Deprecate Panel `. Panel is a somewhat less-used, but still important container for 3-dimensional -data. The term `panel data `__ is +data. The term `panel data `__ is derived from econometrics and is partially responsible for the name pandas: pan(el)-da(ta)-s. The names for the 3 axes are intended to give some semantic meaning to describing operations involving panel data and, in particular, @@ -924,7 +961,7 @@ From DataFrame using ``to_panel`` method .. ipython:: python :okwarning: - midx = pd.MultiIndex(levels=[['one', 'two'], ['x','y']], labels=[[1,1,0,0],[1,0,1,0]]) + midx = pd.MultiIndex(levels=[['one', 'two'], ['x','y']], codes=[[1,1,0,0],[1,0,1,0]]) df = pd.DataFrame({'A' : [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=midx) df.to_panel() @@ -1024,11 +1061,12 @@ Oftentimes, one can simply use a MultiIndex ``DataFrame`` for easily working wit In addition, the ``xarray`` package was built from the ground up, specifically in order to support the multi-dimensional analysis that is one of ``Panel`` s main use cases. -`Here is a link to the xarray panel-transition documentation `__. +`Here is a link to the xarray panel-transition documentation `__. .. ipython:: python :okwarning: + import pandas.util.testing as tm p = tm.makePanel() p @@ -1046,4 +1084,4 @@ Alternatively, one can convert to an xarray ``DataArray``. p.to_xarray() -You can see the full-documentation for the `xarray package `__. +You can see the full-documentation for the `xarray package `__. diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 7fffcadd8ee8c..ad389bbe35b71 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -27,8 +27,8 @@ substantial projects that you feel should be on this list, please let us know. Statistics and Machine Learning ------------------------------- -`Statsmodels `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Statsmodels `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Statsmodels is the prominent Python "statistics and econometrics library" and it has a long-standing special relationship with pandas. 
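Returning to the ``DataFrame.assign`` compatibility note in the hunk above: a version-independent
sketch is simply to chain two ``assign`` calls, so ``B`` is always computed against the updated ``A``
regardless of Python version:

.. code-block:: python

    import pandas as pd

    dependent = pd.DataFrame({"A": [1, 1, 1]})

    # Two chained assign() calls avoid relying on keyword-argument evaluation
    # order, so 'B' always sees the updated 'A' on Python 3.5 and 3.6 alike.
    result = (dependent
              .assign(A=lambda x: x["A"] + 1)
              .assign(B=lambda x: x["A"] + 2))
    result
    #    A  B
    # 0  2  4
    # 1  2  4
    # 2  2  4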
Statsmodels provides powerful statistics, @@ -38,7 +38,7 @@ Statsmodels leverages pandas objects as the underlying data container for comput `sklearn-pandas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Use pandas DataFrames in your `scikit-learn `__ +Use pandas DataFrames in your `scikit-learn `__ ML pipeline. `Featuretools `__ @@ -62,8 +62,8 @@ simplicity produces beautiful and effective visualizations with a minimal amount of code. Altair works with Pandas DataFrames. -`Bokeh `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Bokeh `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Bokeh is a Python interactive visualization library for large datasets that natively uses the latest web technologies. Its goal is to provide elegant, concise construction of novel @@ -74,7 +74,7 @@ large data to thin clients. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Seaborn is a Python visualization library based on -`matplotlib `__. It provides a high-level, dataset-oriented +`matplotlib `__. It provides a high-level, dataset-oriented interface for creating attractive statistical graphics. The plotting functions in seaborn understand pandas objects and leverage pandas grouping operations internally to support concise specification of complex visualizations. Seaborn @@ -85,8 +85,8 @@ fit of statistical models to emphasize patterns in a dataset. `yhat/ggpy `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language. -Based on `"The Grammar of Graphics" `__ it +Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language. +Based on `"The Grammar of Graphics" `__ it provides a powerful, declarative and extremely general way to generate bespoke plots of any kind of data. It's really quite incredible. Various implementations to other languages are available, but a faithful implementation for Python users has long been missing. Although still young @@ -102,7 +102,7 @@ progressing quickly in that direction. `Plotly `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -`Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. +`Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. 
`QtPandas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -116,8 +116,8 @@ library enables DataFrame visualization and manipulation in PyQt4 and PySide app IDE ------ -`IPython `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`IPython `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IPython is an interactive command shell and distributed computing environment. IPython tab completion works with Pandas methods and also @@ -140,7 +140,7 @@ which are utilized by Jupyter Notebook for displaying (Note: HTML tables may or may not be compatible with non-HTML Jupyter output formats.) -See :ref:`Options and Settings ` and :ref:`options.available ` +See :ref:`Options and Settings ` and :ref:`options.available` for pandas ``display.`` settings. `quantopian/qgrid `__ @@ -221,7 +221,7 @@ This package requires valid credentials for this API (non free). ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pandaSDMX is a library to retrieve and acquire statistical data and metadata disseminated in -`SDMX `_ 2.1, an ISO-standard +`SDMX `_ 2.1, an ISO-standard widely used by institutions such as statistics offices, central banks, and international organisations. pandaSDMX can expose datasets and related structural metadata including data flows, code-lists, @@ -230,7 +230,7 @@ or MultiIndexed DataFrames. `fredapi `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -fredapi is a Python interface to the `Federal Reserve Economic Data (FRED) `__ +fredapi is a Python interface to the `Federal Reserve Economic Data (FRED) `__ provided by the Federal Reserve Bank of St. Louis. It works with both the FRED database and ALFRED database that contains point-in-time data (i.e. historic data revisions). fredapi provides a wrapper in Python to the FRED HTTP API, and also provides several convenient methods for parsing and analyzing point-in-time data from ALFRED. @@ -316,7 +316,7 @@ Increasingly, packages are being built on top of pandas to address specific need Data validation --------------- -`Engarde `__ +`Engarde `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Engarde is a lightweight library used to explicitly state your assumptions about your datasets diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 8f8a9fe3e50e0..1c873d604cfe0 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -221,7 +221,7 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra You can **not pass** a ``Series`` directly as a ``ndarray`` typed parameter to a Cython function. Instead pass the actual ``ndarray`` using the - ``.values`` attribute of the ``Series``. The reason is that the Cython + :meth:`Series.to_numpy`. The reason is that the Cython definition is specific to an ndarray and not the passed ``Series``. So, do not do this: @@ -230,11 +230,13 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra apply_integrate_f(df['a'], df['b'], df['N']) - But rather, use ``.values`` to get the underlying ``ndarray``: + But rather, use :meth:`Series.to_numpy` to get the underlying ``ndarray``: .. code-block:: python - apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + apply_integrate_f(df['a'].to_numpy(), + df['b'].to_numpy(), + df['N'].to_numpy()) .. note:: @@ -298,7 +300,7 @@ advanced Cython techniques: Even faster, with the caveat that a bug in our Cython code (an off-by-one error, for example) might cause a segfault because memory access isn't checked. 
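The ``enhancingperf.rst`` note above is about the calling convention rather than the Cython build
itself, so here is a small pure-Python sketch of that pattern; ``apply_integrate_f`` is only a
hypothetical stand-in for the compiled, ndarray-typed function:

.. code-block:: python

    import numpy as np
    import pandas as pd

    def apply_integrate_f(col_a, col_b, col_n):
        # Hypothetical stand-in for the Cython-compiled function,
        # whose signature is typed on ndarrays, not Series.
        assert all(isinstance(arr, np.ndarray) for arr in (col_a, col_b, col_n))
        return col_a + col_b * col_n

    df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0], 'N': [100, 200]})

    # Pass the underlying ndarrays via Series.to_numpy(), not the Series objects.
    result = apply_integrate_f(df['a'].to_numpy(),
                               df['b'].to_numpy(),
                               df['N'].to_numpy())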
-For more about ``boundscheck`` and ``wraparound``, see the Cython docs on +For more about ``boundscheck`` and ``wraparound``, see the Cython docs on `compiler directives `__. .. _enhancingperf.numba: @@ -323,39 +325,45 @@ Numba works by generating optimized machine code using the LLVM compiler infrast Jit ~~~ -We demonstrate how to use Numba to just-in-time compile our code. We simply +We demonstrate how to use Numba to just-in-time compile our code. We simply take the plain Python code from above and annotate with the ``@jit`` decorator. .. code-block:: python import numba + @numba.jit def f_plain(x): - return x * (x - 1) + return x * (x - 1) + @numba.jit def integrate_f_numba(a, b, N): - s = 0 - dx = (b - a) / N - for i in range(N): - s += f_plain(a + i * dx) - return s * dx + s = 0 + dx = (b - a) / N + for i in range(N): + s += f_plain(a + i * dx) + return s * dx + @numba.jit def apply_integrate_f_numba(col_a, col_b, col_N): - n = len(col_N) - result = np.empty(n, dtype='float64') - assert len(col_a) == len(col_b) == n - for i in range(n): - result[i] = integrate_f_numba(col_a[i], col_b[i], col_N[i]) - return result + n = len(col_N) + result = np.empty(n, dtype='float64') + assert len(col_a) == len(col_b) == n + for i in range(n): + result[i] = integrate_f_numba(col_a[i], col_b[i], col_N[i]) + return result + def compute_numba(df): - result = apply_integrate_f_numba(df['a'].values, df['b'].values, df['N'].values) - return pd.Series(result, index=df.index, name='result') + result = apply_integrate_f_numba(df['a'].values, df['b'].values, + df['N'].values) + return pd.Series(result, index=df.index, name='result') -Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a nicer interface by passing/returning pandas objects. +Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a +nicer interface by passing/returning pandas objects. .. code-block:: ipython @@ -375,13 +383,16 @@ Consider the following toy example of doubling each observation: import numba + def double_every_value_nonumba(x): - return x*2 + return x * 2 + @numba.vectorize def double_every_value_withnumba(x): - return x*2 + return x * 2 +.. code-block:: ipython # Custom function without numba In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba) @@ -402,18 +413,18 @@ Caveats Numba will execute on any function, but can only accelerate certain classes of functions. -Numba is best at accelerating functions that apply numerical functions to NumPy -arrays. When passed a function that only uses operations it knows how to +Numba is best at accelerating functions that apply numerical functions to NumPy +arrays. When passed a function that only uses operations it knows how to accelerate, it will execute in ``nopython`` mode. -If Numba is passed a function that includes something it doesn't know how to -work with -- a category that currently includes sets, lists, dictionaries, or -string functions -- it will revert to ``object mode``. In ``object mode``, -Numba will execute but your code will not speed up significantly. If you would -prefer that Numba throw an error if it cannot compile a function in a way that -speeds up your code, pass Numba the argument -``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). 
For more on -troubleshooting Numba modes, see the `Numba troubleshooting page +If Numba is passed a function that includes something it doesn't know how to +work with -- a category that currently includes sets, lists, dictionaries, or +string functions -- it will revert to ``object mode``. In ``object mode``, +Numba will execute but your code will not speed up significantly. If you would +prefer that Numba throw an error if it cannot compile a function in a way that +speeds up your code, pass Numba the argument +``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on +troubleshooting Numba modes, see the `Numba troubleshooting page `__. Read more in the `Numba docs `__. diff --git a/doc/source/extending.rst b/doc/source/extending.rst index ab940384594bc..7046981a3a364 100644 --- a/doc/source/extending.rst +++ b/doc/source/extending.rst @@ -135,6 +135,12 @@ There are two approaches for providing operator support for your ExtensionArray: 2. Use an operator implementation from pandas that depends on operators that are already defined on the underlying elements (scalars) of the ExtensionArray. +.. note:: + + Regardless of the approach, you may want to set ``__array_priority__`` + if you want your implementation to be called when involved in binary operations + with NumPy arrays. + For the first approach, you define selected operators, e.g., ``__add__``, ``__le__``, etc. that you want your ``ExtensionArray`` subclass to support. @@ -157,6 +163,7 @@ your ``MyExtensionArray`` class, as follows: class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): pass + MyExtensionArray._add_arithmetic_ops() MyExtensionArray._add_comparison_ops() @@ -173,6 +180,16 @@ or not that succeeds depends on whether the operation returns a result that's valid for the ``ExtensionArray``. If an ``ExtensionArray`` cannot be reconstructed, an ndarray containing the scalars returned instead. +For ease of implementation and consistency with operations between pandas +and NumPy ndarrays, we recommend *not* handling Series and Indexes in your binary ops. +Instead, you should detect these cases and return ``NotImplemented``. +When pandas encounters an operation like ``op(Series, ExtensionArray)``, pandas +will + +1. unbox the array from the ``Series`` (``Series.array``) +2. call ``result = op(values, ExtensionArray)`` +3. re-box the result in a ``Series`` + .. 
_extending.extension.testing: Testing Extension Arrays @@ -189,6 +206,7 @@ To use a test, subclass it: from pandas.tests.extension import base + class TestConstructors(base.BaseConstructorsTests): pass @@ -261,6 +279,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame def _constructor_expanddim(self): return SubclassedDataFrame + class SubclassedDataFrame(DataFrame): @property @@ -281,7 +300,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(to_framed) - >>> df = SubclassedDataFrame({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) >>> df A B C 0 1 4 7 @@ -297,6 +316,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame 0 1 4 1 2 5 2 3 6 + >>> type(sliced1) @@ -306,6 +326,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame 1 2 2 3 Name: A, dtype: int64 + >>> type(sliced2) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 79e312ca12833..c62b836ed1f33 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -9,14 +9,11 @@ Frequently Asked Questions (FAQ) :suppress: import numpy as np + import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - import pandas as pd pd.options.display.max_rows = 15 - import matplotlib - # matplotlib.style.use('default') - import matplotlib.pyplot as plt - plt.close('all') .. _df-memory-usage: @@ -36,8 +33,7 @@ when calling :meth:`~DataFrame.info`: dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', 'complex128', 'object', 'bool'] n = 5000 - data = dict([(t, np.random.randint(100, size=n).astype(t)) - for t in dtypes]) + data = {t: np.random.randint(100, size=n).astype(t) for t in dtypes} df = pd.DataFrame(data) df['categorical'] = df['object'].astype('category') @@ -99,7 +95,7 @@ of the following code should be: .. code-block:: python >>> if pd.Series([False, True, False]): - ... + ... pass Should it be ``True`` because it's not zero-length, or ``False`` because there are ``False`` values? It is unclear, so instead, pandas raises a ``ValueError``: @@ -107,7 +103,7 @@ are ``False`` values? It is unclear, so instead, pandas raises a ``ValueError``: .. code-block:: python >>> if pd.Series([False, True, False]): - print("I was true") + ... print("I was true") Traceback ... ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). @@ -119,8 +115,8 @@ Alternatively, you might want to compare if the pandas object is ``None``: .. code-block:: python >>> if pd.Series([False, True, False]) is not None: - print("I was not None") - >>> I was not None + ... print("I was not None") + I was not None Below is how to check if any of the values are ``True``: @@ -128,8 +124,8 @@ Below is how to check if any of the values are ``True``: .. code-block:: python >>> if pd.Series([False, True, False]).any(): - print("I am any") - >>> I am any + ... print("I am any") + I am any To evaluate single-element pandas objects in a boolean context, use the method :meth:`~DataFrame.bool`: @@ -316,7 +312,7 @@ Occasionally you may have to deal with data that were created on a machine with a different byte order than the one on which you are running Python. A common symptom of this issue is an error like: -.. code-block:: python +.. code-block:: python-traceback Traceback ... @@ -329,8 +325,8 @@ constructors using something similar to the following: .. 
ipython:: python - x = np.array(list(range(10)), '>i4') # big endian - newx = x.byteswap().newbyteorder() # force native byteorder + x = np.array(list(range(10)), '>i4') # big endian + newx = x.byteswap().newbyteorder() # force native byteorder s = pd.Series(newx) See `the NumPy documentation on byte order diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index fecc336049a40..de188846cce76 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -5,15 +5,15 @@ :suppress: import numpy as np + import matplotlib.pyplot as plt + + import pandas as pd + + plt.close('all') + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - import pandas as pd pd.options.display.max_rows = 15 - import matplotlib - # matplotlib.style.use('default') - import matplotlib.pyplot as plt - plt.close('all') - from collections import OrderedDict ***************************** Group By: split-apply-combine @@ -79,7 +79,7 @@ pandas objects can be split on any of their axes. The abstract definition of grouping is to provide a mapping of labels to group names. To create a GroupBy object (more on what the GroupBy object is later), you may do the following: -.. code-block:: ipython +.. code-block:: python # default is axis=0 >>> grouped = obj.groupby(key) @@ -109,12 +109,12 @@ consider the following ``DataFrame``: .. ipython:: python - df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C' : np.random.randn(8), - 'D' : np.random.randn(8)}) + df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) df On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. @@ -125,6 +125,17 @@ We could naturally group by either the ``A`` or ``B`` columns, or both: grouped = df.groupby('A') grouped = df.groupby(['A', 'B']) +.. versionadded:: 0.24 + +If we also have a MultiIndex on columns ``A`` and ``B``, we can group by all +but the specified columns + +.. ipython:: python + + df2 = df.set_index(['A', 'B']) + grouped = df2.groupby(level=df2.index.names.difference(['B'])) + grouped.sum() + These will split the DataFrame on its index (rows). We could also split by the columns: @@ -176,7 +187,7 @@ By default the group keys are sorted during the ``groupby`` operation. You may h .. ipython:: python - df2 = pd.DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]}) + df2 = pd.DataFrame({'X': ['B', 'B', 'A', 'A'], 'Y': [1, 2, 3, 4]}) df2.groupby(['X']).sum() df2.groupby(['X'], sort=False).sum() @@ -186,7 +197,7 @@ For example, the groups created by ``groupby()`` below are in the order they app .. ipython:: python - df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]}) + df3 = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]}) df3.groupby(['X']).get_group('A') df3.groupby(['X']).get_group('B') @@ -364,12 +375,12 @@ getting a column from a DataFrame, you can do: .. 
ipython:: python :suppress: - df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C' : np.random.randn(8), - 'D' : np.random.randn(8)}) + df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) .. ipython:: python @@ -399,8 +410,8 @@ natural and functions similarly to :py:func:`itertools.groupby`: In [4]: grouped = df.groupby('A') In [5]: for name, group in grouped: - ...: print(name) - ...: print(group) + ...: print(name) + ...: print(group) ...: In the case of grouping by multiple keys, the group name will be a tuple: @@ -408,8 +419,8 @@ In the case of grouping by multiple keys, the group name will be a tuple: .. ipython:: In [5]: for name, group in df.groupby(['A', 'B']): - ...: print(name) - ...: print(group) + ...: print(name) + ...: print(group) ...: See :ref:`timeseries.iterating-label`. @@ -550,8 +561,7 @@ need to rename, then you can add in a chained operation for a ``Series`` like th (grouped['C'].agg([np.sum, np.mean, np.std]) .rename(columns={'sum': 'foo', 'mean': 'bar', - 'std': 'baz'}) - ) + 'std': 'baz'})) For a grouped ``DataFrame``, you can rename in a similar manner: @@ -560,8 +570,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: (grouped.agg([np.sum, np.mean, np.std]) .rename(columns={'sum': 'foo', 'mean': 'bar', - 'std': 'baz'}) - ) + 'std': 'baz'})) Applying different functions to DataFrame columns @@ -572,8 +581,8 @@ columns of a DataFrame: .. ipython:: python - grouped.agg({'C' : np.sum, - 'D' : lambda x: np.std(x, ddof=1)}) + grouped.agg({'C': np.sum, + 'D': lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be either implemented on GroupBy or available via :ref:`dispatching @@ -581,7 +590,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. ipython:: python - grouped.agg({'C' : 'sum', 'D' : 'std'}) + grouped.agg({'C': 'sum', 'D': 'std'}) .. note:: @@ -591,6 +600,8 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. ipython:: python + from collections import OrderedDict + grouped.agg({'D': 'std', 'C': 'mean'}) grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) @@ -636,13 +647,13 @@ For example, suppose we wished to standardize the data within each group: index = pd.date_range('10/1/1999', periods=1100) ts = pd.Series(np.random.normal(0.5, 2, 1100), index) - ts = ts.rolling(window=100,min_periods=100).mean().dropna() + ts = ts.rolling(window=100, min_periods=100).mean().dropna() ts.head() ts.tail() - key = lambda x: x.year - zscore = lambda x: (x - x.mean()) / x.std() - transformed = ts.groupby(key).transform(zscore) + + transformed = (ts.groupby(lambda x: x.year) + .transform(lambda x: (x - x.mean()) / x.std())) We would expect the result to now have mean 0 and standard deviation 1 within each group, which we can easily check: @@ -650,12 +661,12 @@ each group, which we can easily check: .. ipython:: python # Original Data - grouped = ts.groupby(key) + grouped = ts.groupby(lambda x: x.year) grouped.mean() grouped.std() # Transformed Data - grouped_trans = transformed.groupby(key) + grouped_trans = transformed.groupby(lambda x: x.year) grouped_trans.mean() grouped_trans.std() @@ -673,14 +684,16 @@ match the shape of the input array. .. 
ipython:: python - data_range = lambda x: x.max() - x.min() - ts.groupby(key).transform(data_range) + ts.groupby(lambda x: x.year).transform(lambda x: x.max() - x.min()) Alternatively, the built-in methods could be used to produce the same outputs. .. ipython:: python - ts.groupby(key).transform('max') - ts.groupby(key).transform('min') + max = ts.groupby(lambda x: x.year).transform('max') + min = ts.groupby(lambda x: x.year).transform('min') + + max - min Another common data transform is to replace missing data with the group mean. @@ -706,9 +719,7 @@ Another common data transform is to replace missing data with the group mean. # Non-NA count in each group grouped.count() - f = lambda x: x.fillna(x.mean()) - - transformed = grouped.transform(f) + transformed = grouped.transform(lambda x: x.fillna(x.mean())) We can verify that the group means have not changed in the transformed data and that the transformed data contains no NAs. @@ -717,12 +728,12 @@ and that the transformed data contains no NAs. grouped_trans = transformed.groupby(key) - grouped.mean() # original group means - grouped_trans.mean() # transformation did not change group means + grouped.mean() # original group means + grouped_trans.mean() # transformation did not change group means - grouped.count() # original has some missing data points - grouped_trans.count() # counts after transformation - grouped_trans.size() # Verify non-NA count equals group size + grouped.count() # original has some missing data points + grouped_trans.count() # counts after transformation + grouped_trans.size() # Verify non-NA count equals group size .. note:: @@ -775,11 +786,10 @@ missing values with the ``ffill()`` method. .. ipython:: python - df_re = pd.DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df_re = pd.DataFrame({'date': pd.date_range(start='2016-01-01', periods=4, + freq='W'), + 'group': [1, 1, 2, 2], + 'val': [5, 6, 7, 8]}).set_index('date') df_re df_re.groupby('group').resample('1D').ffill() @@ -915,8 +925,8 @@ The dimension of the returned result can also change: In [8]: grouped = df.groupby('A')['C'] In [10]: def f(group): - ....: return pd.DataFrame({'original' : group, - ....: 'demeaned' : group - group.mean()}) + ....: return pd.DataFrame({'original': group, + ....: 'demeaned': group - group.mean()}) ....: In [11]: grouped.apply(f) @@ -927,7 +937,8 @@ that is itself a series, and possibly upcast the result to a DataFrame: .. ipython:: python def f(x): - return pd.Series([ x, x**2 ], index = ['x', 'x^2']) + return pd.Series([x, x ** 2], index=['x', 'x^2']) + s = pd.Series(np.random.rand(5)) s s.apply(f) @@ -949,7 +960,7 @@ that is itself a series, and possibly upcast the result to a DataFrame: .. ipython:: python - d = pd.DataFrame({"a":["x", "y"], "b":[1,2]}) + d = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) def identity(df): print(df) return df @@ -984,6 +995,35 @@ Note that ``df.groupby('A').colname.std().`` is more efficient than is only interesting over one column (here ``colname``), it may be filtered *before* applying the aggregation function. +.. note:: + Any object column, also if it contains numerical values such as ``Decimal`` + objects, is considered as a "nuisance" columns. They are excluded from + aggregate functions automatically in groupby. + + If you do wish to include decimal or object columns in an aggregation with + other non-nuisance data types, you must do so explicitly. + +.. 
ipython:: python + + from decimal import Decimal + df_dec = pd.DataFrame( + {'id': [1, 2, 1, 2], + 'int_column': [1, 2, 3, 4], + 'dec_column': [Decimal('0.50'), Decimal('0.15'), + Decimal('0.25'), Decimal('0.40')] + } + ) + + # Decimal columns can be sum'd explicitly by themselves... + df_dec.groupby(['id'])[['dec_column']].sum() + + # ...but cannot be combined with standard data types or they will be excluded + df_dec.groupby(['id'])[['int_column', 'dec_column']].sum() + + # Use .agg function to aggregate over standard and "nuisance" data types + # at the same time + df_dec.groupby(['id']).agg({'int_column': 'sum', 'dec_column': 'sum'}) + .. _groupby.observed: Handling of (un)observed Categorical values @@ -997,19 +1037,25 @@ Show all values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count() + pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], + categories=['a', 'b']), + observed=False).count() Show only the observed values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=True).count() + pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], + categories=['a', 'b']), + observed=True).count() The returned dtype of the grouped will *always* include *all* of the categories that were grouped. .. ipython:: python - s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count() + s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], + categories=['a', 'b']), + observed=False).count() s.index.dtype .. _groupby.missing: @@ -1049,21 +1095,19 @@ use the ``pd.Grouper`` to provide this local control. import datetime - df = pd.DataFrame({ - 'Branch' : 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1,3,5,1,8,1,9,3], - 'Date' : [ - datetime.datetime(2013,1,1,13,0), - datetime.datetime(2013,1,1,13,5), - datetime.datetime(2013,10,1,20,0), - datetime.datetime(2013,10,2,10,0), - datetime.datetime(2013,10,1,20,0), - datetime.datetime(2013,10,2,10,0), - datetime.datetime(2013,12,2,12,0), - datetime.datetime(2013,12,2,14,0), - ] - }) + df = pd.DataFrame({'Branch': 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], + 'Date': [ + datetime.datetime(2013, 1, 1, 13, 0), + datetime.datetime(2013, 1, 1, 13, 5), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 12, 2, 12, 0), + datetime.datetime(2013, 12, 2, 14, 0)] + }) df @@ -1071,7 +1115,7 @@ Groupby a specific column with the desired frequency. This is like resampling. .. ipython:: python - df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() + df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer']).sum() You have an ambiguous specification in that you have a named index and a column that could be potential groupers. @@ -1080,9 +1124,9 @@ that could be potential groupers. 
df = df.set_index('Date') df['Date'] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq='6M',key='Date'),'Buyer']).sum() + df.groupby([pd.Grouper(freq='6M', key='Date'), 'Buyer']).sum() - df.groupby([pd.Grouper(freq='6M',level='Date'),'Buyer']).sum() + df.groupby([pd.Grouper(freq='6M', level='Date'), 'Buyer']).sum() Taking the first rows of each group @@ -1139,7 +1183,7 @@ As with other methods, passing ``as_index=False``, will achieve a filtration, wh .. ipython:: python df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A',as_index=False) + g = df.groupby('A', as_index=False) g.nth(0) g.nth(-1) @@ -1250,12 +1294,11 @@ code more readable. First we set the data: .. ipython:: python - import numpy as np n = 1000 df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), 'Product': np.random.choice(['Product_1', 'Product_2'], n), - 'Revenue': (np.random.random(n)*50+10).round(2), + 'Revenue': (np.random.random(n) * 50 + 10).round(2), 'Quantity': np.random.randint(1, 10, size=n)}) df.head(2) @@ -1264,7 +1307,7 @@ Now, to find prices per store/product, we can simply do: .. ipython:: python (df.groupby(['Store', 'Product']) - .pipe(lambda grp: grp.Revenue.sum()/grp.Quantity.sum()) + .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) .unstack().round(2)) Piping can also be expressive when you want to deliver a grouped object to some @@ -1272,7 +1315,7 @@ arbitrary function, for example: .. code-block:: python - (df.groupby(['Store', 'Product']).pipe(report_func) + df.groupby(['Store', 'Product']).pipe(report_func) where ``report_func`` takes a GroupBy object and creates a report from that. @@ -1287,7 +1330,8 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on .. ipython:: python - df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]}) + df = pd.DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], + 'c': [1, 0, 0], 'd': [2, 3, 4]}) df df.groupby(df.sum(), axis=1).sum() @@ -1331,7 +1375,7 @@ In the following examples, **df.index // 5** returns a binary array which is use .. ipython:: python - df = pd.DataFrame(np.random.randn(10,2)) + df = pd.DataFrame(np.random.randn(10, 2)) df df.index // 5 df.groupby(df.index // 5).std() @@ -1346,12 +1390,10 @@ column index name will be used as the name of the inserted column: .. 
ipython:: python - df = pd.DataFrame({ - 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], - 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], - 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], - 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], - }) + df = pd.DataFrame({'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]}) def compute_metrics(x): result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index f5ac7b77f4db1..38f73f8617ced 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -14,15 +14,15 @@ pandas: powerful Python data analysis toolkit **Binary Installers:** https://pypi.org/project/pandas -**Source Repository:** http://github.com/pandas-dev/pandas +**Source Repository:** https://github.com/pandas-dev/pandas **Issues & Ideas:** https://github.com/pandas-dev/pandas/issues -**Q&A Support:** http://stackoverflow.com/questions/tagged/pandas +**Q&A Support:** https://stackoverflow.com/questions/tagged/pandas -**Developer Mailing List:** http://groups.google.com/group/pydata +**Developer Mailing List:** https://groups.google.com/forum/#!forum/pydata -**pandas** is a `Python `__ package providing fast, +**pandas** is a `Python `__ package providing fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, **real world** data @@ -45,7 +45,7 @@ and :class:`DataFrame` (2-dimensional), handle the vast majority of typical use cases in finance, statistics, social science, and many areas of engineering. For R users, :class:`DataFrame` provides everything that R's ``data.frame`` provides and much more. pandas is built on top of `NumPy -`__ and is intended to integrate well within a scientific +`__ and is intended to integrate well within a scientific computing environment with many other 3rd party libraries. Here are just a few of the things that pandas does well: @@ -86,13 +86,13 @@ is the ideal tool for all of these tasks. Some other notes - pandas is **fast**. Many of the low-level algorithmic bits have been - extensively tweaked in `Cython `__ code. However, as with + extensively tweaked in `Cython `__ code. However, as with anything else generalization usually sacrifices performance. So if you focus on one feature for your application you may be able to create a faster specialized tool. - pandas is a dependency of `statsmodels - `__, making it an important part of the + `__, making it an important part of the statistical computing ecosystem in Python. - pandas has been used extensively in production in financial applications. @@ -101,7 +101,7 @@ Some other notes This documentation assumes general familiarity with NumPy. If you haven't used NumPy much or at all, do invest some time in `learning about NumPy - `__ first. + `__ first. See the package overview for more detail about what's in the library. @@ -118,7 +118,7 @@ See the package overview for more detail about what's in the library. {{ single_doc }} {% endif -%} {% if not single_doc -%} - whatsnew + What's New install contributing overview @@ -159,5 +159,5 @@ See the package overview for more detail about what's in the library. 
developer internals extending - release + releases {% endif -%} diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 1c63acce6e3fa..6ad9c573249a3 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -190,7 +190,7 @@ columns. .. ipython:: python - df.loc[:,['B', 'A']] = df[['A', 'B']].values + df.loc[:,['B', 'A']] = df[['A', 'B']].to_numpy() df[['A', 'B']] @@ -537,10 +537,10 @@ A list of indexers where any element is out of bounds will raise an .. code-block:: python - dfl.iloc[[4, 5, 6]] + >>> dfl.iloc[[4, 5, 6]] IndexError: positional indexers are out-of-bounds - dfl.iloc[:, 4] + >>> dfl.iloc[:, 4] IndexError: single positional indexer is out-of-bounds .. _indexing.callable: @@ -1571,9 +1571,9 @@ Setting metadata Indexes are "mostly immutable", but it is possible to set and change their metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and -``labels``). +``codes``). -You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_labels`` +You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_codes`` to set these attributes directly. They default to returning a copy; however, you can specify ``inplace=True`` to have the data change in place. @@ -1588,7 +1588,7 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes. ind.name = "bob" ind -``set_names``, ``set_levels``, and ``set_labels`` also take an optional +``set_names``, ``set_levels``, and ``set_codes`` also take an optional `level`` argument .. ipython:: python @@ -1794,7 +1794,7 @@ interpreter executes this code: .. code-block:: python - dfmi.loc[:,('one','second')] = value + dfmi.loc[:, ('one', 'second')] = value # becomes dfmi.loc.__setitem__((slice(None), ('one', 'second')), value) @@ -1827,10 +1827,10 @@ that you've done this: .. code-block:: python def do_something(df): - foo = df[['bar', 'baz']] # Is foo a view? A copy? Nobody knows! - # ... many lines here ... - foo['quux'] = value # We don't know whether this will modify df or not! - return foo + foo = df[['bar', 'baz']] # Is foo a view? A copy? Nobody knows! + # ... many lines here ... + foo['quux'] = value # We don't know whether this will modify df or not! + return foo Yikes! diff --git a/doc/source/install.rst b/doc/source/install.rst index 7a846c817aee2..4a71dbcec17e6 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -225,7 +225,7 @@ Dependencies ------------ * `setuptools `__: 24.2.0 or higher -* `NumPy `__: 1.9.0 or higher +* `NumPy `__: 1.12.0 or higher * `python-dateutil `__: 2.5.0 or higher * `pytz `__ @@ -236,11 +236,11 @@ Recommended Dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.4.6 or higher. + If installed, must be Version 2.6.1 or higher. * `bottleneck `__: for accelerating certain types of ``nan`` evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed, - must be Version 1.0.0 or higher. + must be Version 1.2.0 or higher. .. note:: @@ -255,21 +255,21 @@ Optional Dependencies * `Cython `__: Only necessary to build development version. Version 0.28.2 or higher. -* `SciPy `__: miscellaneous statistical functions, Version 0.14.0 or higher +* `SciPy `__: miscellaneous statistical functions, Version 0.18.1 or higher * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended. 
-* `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended. -* `Feather Format `__: necessary for feather-based storage, version 0.3.1 or higher. -* `Apache Parquet `__, either `pyarrow `__ (>= 0.4.1) or `fastparquet `__ (>= 0.0.6) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support. +* `PyTables `__: necessary for HDF5-based storage, Version 3.4.2 or higher +* `pyarrow `__ (>= 0.7.0): necessary for feather-based storage. +* `Apache Parquet `__, either `pyarrow `__ (>= 0.7.0) or `fastparquet `__ (>= 0.1.2) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support. * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are: * `psycopg2 `__: for PostgreSQL * `pymysql `__: for MySQL. * `SQLite `__: for SQLite, this is included in Python's standard library by default. -* `matplotlib `__: for plotting, Version 1.4.3 or higher. +* `matplotlib `__: for plotting, Version 2.0.0 or higher. * For Excel I/O: - * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt) + * `xlrd/xlwt `__: Excel reading (xlrd), version 1.0.0 or higher required, and writing (xlwt) * `openpyxl `__: openpyxl version 2.4.0 for writing .xlsx files (xlrd >= 0.9.0) * `XlsxWriter `__: Alternative Excel writer @@ -286,7 +286,9 @@ Optional Dependencies `xsel `__, or `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* `pandas-gbq `__: for Google BigQuery I/O. +* `pandas-gbq + `__: + for Google BigQuery I/O. (pandas-gbq >= 0.8.0) * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. diff --git a/doc/source/internals.rst b/doc/source/internals.rst index fce99fc633440..c39dafa88db92 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -6,9 +6,10 @@ :suppress: import numpy as np + import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - import pandas as pd pd.options.display.max_rows = 15 ********* @@ -73,22 +74,23 @@ MultiIndex ~~~~~~~~~~ Internally, the ``MultiIndex`` consists of a few things: the **levels**, the -integer **labels**, and the level **names**: +integer **codes** (until version 0.24 named *labels*), and the level **names**: .. ipython:: python - index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second']) + index = pd.MultiIndex.from_product([range(3), ['one', 'two']], + names=['first', 'second']) index index.levels - index.labels + index.codes index.names -You can probably guess that the labels determine which unique element is +You can probably guess that the codes determine which unique element is identified with that location at each layer of the index. It's important to -note that sortedness is determined **solely** from the integer labels and does +note that sortedness is determined **solely** from the integer codes and does not check (or care) whether the levels themselves are sorted. 
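Since the ``internals.rst`` hunk above renames the integer ``labels`` to ``codes``, a minimal
construction sketch, mirroring the ``MultiIndex`` call used in the ``dsintro.rst`` change earlier
in this diff, may help when building the levels and codes by hand:

.. code-block:: python

    import pandas as pd

    # 'codes' (called 'labels' before 0.24) are integer positions into 'levels'.
    index = pd.MultiIndex(levels=[['one', 'two'], ['x', 'y']],
                          codes=[[1, 1, 0, 0], [1, 0, 1, 0]])
    index.levels
    index.codes
    index.names    # (None, None) here, since no names were passed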
Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but -if you compute the levels and labels yourself, please be careful. +if you compute the levels and codes yourself, please be careful. Values ~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index 039cba2993381..313c4d723d079 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -5,25 +5,23 @@ .. ipython:: python :suppress: - import os import csv - from pandas.compat import StringIO, BytesIO - import pandas as pd - ExcelWriter = pd.ExcelWriter + import os + import matplotlib.pyplot as plt import numpy as np - np.random.seed(123456) + import pandas as pd + from pandas.compat import StringIO, BytesIO + + randn = np.random.randn np.set_printoptions(precision=4, suppress=True) - - import matplotlib.pyplot as plt plt.close('all') - - import pandas.util.testing as tm pd.options.display.max_rows = 15 clipdf = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['p', 'q', 'r']}, index=['x', 'y', 'z']) + =============================== IO Tools (Text, CSV, HDF5, ...) =============================== @@ -40,14 +38,14 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like :delim: ; text;`CSV `__;:ref:`read_csv`;:ref:`to_csv` - text;`JSON `__;:ref:`read_json`;:ref:`to_json` + text;`JSON `__;:ref:`read_json`;:ref:`to_json` text;`HTML `__;:ref:`read_html`;:ref:`to_html` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` - binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` + binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle` @@ -146,7 +144,10 @@ usecols : list-like or callable, default ``None`` .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) @@ -192,7 +193,10 @@ skiprows : list-like or integer, default ``None`` .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) @@ -367,7 +371,10 @@ columns: .. ipython:: python - data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') print(data) df = pd.read_csv(StringIO(data), dtype=object) @@ -388,7 +395,11 @@ of :func:`~pandas.read_csv`: .. ipython:: python - data = "col_1\n1\n2\n'A'\n4.22" + data = ("col_1\n" + "1\n" + "2\n" + "'A'\n" + "4.22") df = pd.read_csv(StringIO(data), converters={'col_1': str}) df df['col_1'].apply(type).value_counts() @@ -427,7 +438,8 @@ worth trying. .. ipython:: python :okwarning: - df = pd.DataFrame({'col_1': list(range(500000)) + ['a', 'b'] + list(range(500000))}) + col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000)) + df = pd.DataFrame({'col_1': col_1}) df.to_csv('foo.csv') mixed_df = pd.read_csv('foo.csv') mixed_df['col_1'].apply(type).value_counts() @@ -455,7 +467,10 @@ Specifying Categorical dtype .. 
ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data)).dtypes @@ -479,7 +494,6 @@ that column's ``dtype``. .. ipython:: python from pandas.api.types import CategoricalDtype - dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes @@ -525,7 +539,10 @@ used as the column names: .. ipython:: python - data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') print(data) pd.read_csv(StringIO(data)) @@ -544,7 +561,11 @@ If the header is in a row other than the first, pass the row number to .. ipython:: python - data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('skip this skip it\n' + 'a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') pd.read_csv(StringIO(data), header=1) .. note:: @@ -565,7 +586,9 @@ distinguish between them so as to prevent overwriting data: .. ipython :: python - data = 'a,b,a\n0,1,2\n3,4,5' + data = ('a,b,a\n' + '0,1,2\n' + '3,4,5') pd.read_csv(StringIO(data)) There is no more duplicate data because ``mangle_dupe_cols=True`` by default, @@ -633,7 +656,13 @@ be ignored. By default, completely blank lines will be ignored as well. .. ipython:: python - data = '\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6' + data = ('\n' + 'a,b,c\n' + ' \n' + '# commented line\n' + '1,2,3\n' + '\n' + '4,5,6') print(data) pd.read_csv(StringIO(data), comment='#') @@ -641,7 +670,12 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = 'a,b,c\n\n1,2,3\n\n\n4,5,6' + data = ('a,b,c\n' + '\n' + '1,2,3\n' + '\n' + '\n' + '4,5,6') pd.read_csv(StringIO(data), skip_blank_lines=False) .. warning:: @@ -652,20 +686,32 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = '#comment\na,b,c\nA,B,C\n1,2,3' + data = ('#comment\n' + 'a,b,c\n' + 'A,B,C\n' + '1,2,3') pd.read_csv(StringIO(data), comment='#', header=1) - data = 'A,B,C\n#comment\na,b,c\n1,2,3' + data = ('A,B,C\n' + '#comment\n' + 'a,b,c\n' + '1,2,3') pd.read_csv(StringIO(data), comment='#', skiprows=2) If both ``header`` and ``skiprows`` are specified, ``header`` will be relative to the end of ``skiprows``. For example: - .. ipython:: python +.. ipython:: python - data = '# empty\n# second empty line\n# third empty' \ - 'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0' - print(data) - pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + data = ('# empty\n' + '# second empty line\n' + '# third emptyline\n' + 'X,Y,Z\n' + '1,2,3\n' + 'A,B,C\n' + '1,2.,4.\n' + '5.,NaN,10.0\n') + print(data) + pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) .. _io.comments: @@ -677,10 +723,10 @@ Sometimes comments or meta data may be included in a file: .. ipython:: python :suppress: - data = ("ID,level,category\n" - "Patient1,123000,x # really unpleasant\n" - "Patient2,23000,y # wouldn't take his medicine\n" - "Patient3,1234018,z # awesome") + data = ("ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -718,7 +764,10 @@ result in byte strings being decoded to unicode in the result: .. 
ipython:: python - data = b'word,length\nTr\xc3\xa4umen,7\nGr\xc3\xbc\xc3\x9fe,5'.decode('utf8').encode('latin-1') + data = (b'word,length\n' + b'Tr\xc3\xa4umen,7\n' + b'Gr\xc3\xbc\xc3\x9fe,5') + data = data.decode('utf8').encode('latin-1') df = pd.read_csv(BytesIO(data), encoding='latin-1') df df['word'][1] @@ -738,12 +787,16 @@ first column will be used as the ``DataFrame``'s row names: .. ipython:: python - data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + data = ('a,b,c\n' + '4,apple,bat,5.7\n' + '8,orange,cow,10') pd.read_csv(StringIO(data)) .. ipython:: python - data = 'index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + data = ('index,a,b,c\n' + '4,apple,bat,5.7\n' + '8,orange,cow,10') pd.read_csv(StringIO(data), index_col=0) Ordinarily, you can achieve this behavior using the ``index_col`` option. @@ -754,7 +807,9 @@ index column inference and discard the last column, pass ``index_col=False``: .. ipython:: python - data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + data = ('a,b,c\n' + '4,apple,bat,\n' + '8,orange,cow,') print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), index_col=False) @@ -764,7 +819,9 @@ If a subset of data is being parsed using the ``usecols`` option, the .. ipython:: python - data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + data = ('a,b,c\n' + '4,apple,bat,\n' + '8,orange,cow,') print(data) pd.read_csv(StringIO(data), usecols=['b', 'c']) pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0) @@ -812,12 +869,12 @@ column names: .. ipython:: python :suppress: - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -895,9 +952,8 @@ take full advantage of the flexibility of the date parsing API: .. ipython:: python - import pandas.io.date_converters as conv df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - date_parser=conv.parse_date_time) + date_parser=pd.io.date_converters.parse_date_time) df Pandas will try to call the ``date_parser`` function in three different ways. If @@ -990,9 +1046,12 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: .. ipython:: python :suppress: - data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + data = ("date,value,cat\n" + "1/6/2000,5,a\n" + "2/6/2000,10,b\n" + "3/6/2000,15,c") with open('tmp.csv', 'w') as fh: - fh.write(data) + fh.write(data) .. ipython:: python @@ -1016,9 +1075,12 @@ writing to a file). 
For example: val = '0.3066101993807095471566981359501369297504425048828125' data = 'a,b,c\n1,2,{0}'.format(val) - abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision=None)['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision='high')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision='round_trip')['c'][0] - float(val)) .. _io.thousands: @@ -1033,10 +1095,10 @@ correctly: .. ipython:: python :suppress: - data = ("ID|level|category\n" - "Patient1|123,000|x\n" - "Patient2|23,000|y\n" - "Patient3|1,234,018|z") + data = ("ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -1089,7 +1151,7 @@ Let us consider some examples: .. code-block:: python - read_csv(path, na_values=[5]) + pd.read_csv('path_to_file.csv', na_values=[5]) In the example above ``5`` and ``5.0`` will be recognized as ``NaN``, in addition to the defaults. A string will first be interpreted as a numerical @@ -1097,19 +1159,19 @@ addition to the defaults. A string will first be interpreted as a numerical .. code-block:: python - read_csv(path, keep_default_na=False, na_values=[""]) + pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=[""]) Above, only an empty field will be recognized as ``NaN``. .. code-block:: python - read_csv(path, keep_default_na=False, na_values=["NA", "0"]) + pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=["NA", "0"]) Above, both ``NA`` and ``0`` as strings are ``NaN``. .. code-block:: python - read_csv(path, na_values=["Nope"]) + pd.read_csv('path_to_file.csv', na_values=["Nope"]) The default values, in addition to the string ``"Nope"`` are recognized as ``NaN``. @@ -1132,10 +1194,10 @@ as a ``Series``: .. ipython:: python :suppress: - data = ("level\n" - "Patient1,123000\n" - "Patient2,23000\n" - "Patient3,1234018") + data = ("level\n" + "Patient1,123000\n" + "Patient2,23000\n" + "Patient3,1234018") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -1144,7 +1206,7 @@ as a ``Series``: print(open('tmp.csv').read()) - output = pd.read_csv('tmp.csv', squeeze=True) + output = pd.read_csv('tmp.csv', squeeze=True) output type(output) @@ -1166,7 +1228,9 @@ options as follows: .. ipython:: python - data= 'a,b,c\n1,Yes,2\n3,No,4' + data = ('a,b,c\n' + '1,Yes,2\n' + '3,No,4') print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) @@ -1181,18 +1245,13 @@ too few fields will have NA values filled in the trailing fields. Lines with too many fields will raise an error by default: .. ipython:: python - :suppress: - - data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' + :okexcept: -.. code-block:: ipython - - In [27]: data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' - - In [28]: pd.read_csv(StringIO(data)) - --------------------------------------------------------------------------- - ParserError Traceback (most recent call last) - ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4 + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6,7\n' + '8,9,10') + pd.read_csv(StringIO(data)) You can elect to skip bad lines: @@ -1437,7 +1496,7 @@ returned object: .. 
ipython:: python - df = pd.read_csv("data/mindex_ex.csv", index_col=[0,1]) + df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1]) df df.loc[1978] @@ -1480,7 +1539,6 @@ with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index wi .. ipython:: python :suppress: - import os os.remove('mi.csv') os.remove('mi2.csv') @@ -1580,12 +1638,19 @@ You can pass in a URL to a CSV file: df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', sep='\t') -S3 URLs are handled as well: +S3 URLs are handled as well but require installing the `S3Fs +`_ library: .. code-block:: python df = pd.read_csv('s3://pandas-test/tips.csv') +If your S3 bucket requires cedentials you will need to set them as environment +variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs +documentation on credentials +`_. + + Writing out Data '''''''''''''''' @@ -1603,7 +1668,7 @@ function takes a number of arguments. Only the first is required. * ``sep`` : Field delimiter for the output file (default ",") * ``na_rep``: A string representation of a missing value (default '') * ``float_format``: Format string for floating point numbers -* ``cols``: Columns to write (default None) +* ``columns``: Columns to write (default None) * ``header``: Whether to write out the column names (default True) * ``index``: whether to write row (index) names (default True) * ``index_label``: Column label(s) for index column(s) if desired. If None @@ -1834,8 +1899,7 @@ For example: .. code-block:: python - DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json() # raises - + >>> DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json() # raises RuntimeError: Unhandled numpy dtype 15 can be dealt with by specifying a simple ``default_handler``: @@ -1960,9 +2024,8 @@ Preserve string indices: .. ipython:: python - si = pd.DataFrame(np.zeros((4, 4)), - columns=list(range(4)), - index=[str(i) for i in range(4)]) + si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), + index=[str(i) for i in range(4)]) si si.index si.columns @@ -2014,11 +2077,11 @@ data: .. ipython:: python - timeit pd.read_json(jsonfloats) + %timeit pd.read_json(jsonfloats) .. ipython:: python - timeit pd.read_json(jsonfloats, numpy=True) + %timeit pd.read_json(jsonfloats, numpy=True) The speedup is less noticeable for smaller datasets: @@ -2028,11 +2091,11 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python - timeit pd.read_json(jsonfloats) + %timeit pd.read_json(jsonfloats) .. ipython:: python - timeit pd.read_json(jsonfloats, numpy=True) + %timeit pd.read_json(jsonfloats, numpy=True) .. warning:: @@ -2053,7 +2116,6 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python :suppress: - import os os.remove('test.json') .. _io.json_normalize: @@ -2075,20 +2137,16 @@ into a flat table. .. 
ipython:: python data = [{'state': 'Florida', - 'shortname': 'FL', - 'info': { - 'governor': 'Rick Scott' - }, - 'counties': [{'name': 'Dade', 'population': 12345}, + 'shortname': 'FL', + 'info': {'governor': 'Rick Scott'}, + 'counties': [{'name': 'Dade', 'population': 12345}, {'name': 'Broward', 'population': 40000}, {'name': 'Palm Beach', 'population': 60000}]}, - {'state': 'Ohio', - 'shortname': 'OH', - 'info': { - 'governor': 'John Kasich' - }, - 'counties': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}]}] + {'state': 'Ohio', + 'shortname': 'OH', + 'info': {'governor': 'John Kasich'}, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) @@ -2136,11 +2194,10 @@ a JSON string with two fields, ``schema`` and ``data``. .. ipython:: python - df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3), - }, index=pd.Index(range(3), name='idx')) + df = pd.DataFrame({'A': [1, 2, 3], + 'B': ['a', 'b', 'c'], + 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, + index=pd.Index(range(3), name='idx')) df df.to_json(orient='table', date_format="iso") @@ -2273,7 +2330,7 @@ indicate missing values and the subsequent read cannot distinguish the intent. new_df = pd.read_json('test.json', orient='table') print(new_df.index.name) -.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/ +.. _Table Schema: https://specs.frictionlessdata.io/json-table-schema/ HTML ---- @@ -2301,7 +2358,7 @@ Read a URL with no options: .. ipython:: python - url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' + url = 'https://www.fdic.gov/bank/individual/failed/banklist.html' dfs = pd.read_html(url) dfs @@ -2316,7 +2373,6 @@ as a string: .. ipython:: python :suppress: - import os file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) .. ipython:: python @@ -2341,7 +2397,7 @@ You can even pass in an instance of ``StringIO`` if you so desire: that having so many network-accessing functions slows down the documentation build. If you spot an error or an example that doesn't run, please do not hesitate to report it over on `pandas GitHub issues page - `__. + `__. Read a URL and match a table that contains specific text: @@ -2411,8 +2467,8 @@ columns to strings. .. code-block:: python url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' - dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC': - str}) + dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0, + converters={'MNC': str}) .. versionadded:: 0.19 @@ -2650,16 +2706,16 @@ parse HTML tables in the top-level pandas io function ``read_html``. .. |svm| replace:: **strictly valid markup** -.. _svm: http://validator.w3.org/docs/help.html#validation_basics +.. _svm: https://validator.w3.org/docs/help.html#validation_basics .. |html5lib| replace:: **html5lib** .. _html5lib: https://github.com/html5lib/html5lib-python .. |BeautifulSoup4| replace:: **BeautifulSoup4** -.. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup +.. _BeautifulSoup4: https://www.crummy.com/software/BeautifulSoup .. |lxml| replace:: **lxml** -.. _lxml: http://lxml.de +.. _lxml: https://lxml.de @@ -2687,7 +2743,7 @@ file, and the ``sheet_name`` indicating which sheet to parse. .. 
code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls', sheet_name='Sheet1') + pd.read_excel('path_to_file.xls', sheet_name='Sheet1') .. _io.excel.excelfile_class: @@ -2724,7 +2780,8 @@ different parameters: data = {} # For when Sheet1's format differs from Sheet2 with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, na_values=['NA']) + data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, + na_values=['NA']) data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=1) Note that if the same parsing parameters are used for all sheets, a list @@ -2735,11 +2792,14 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc # using the ExcelFile class data = {} with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = read_excel(xls, 'Sheet1', index_col=None, na_values=['NA']) - data['Sheet2'] = read_excel(xls, 'Sheet2', index_col=None, na_values=['NA']) + data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, + na_values=['NA']) + data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=None, + na_values=['NA']) # equivalent using the read_excel function - data = read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], index_col=None, na_values=['NA']) + data = pd.read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], + index_col=None, na_values=['NA']) .. _io.excel.specifying_sheets: @@ -2761,35 +2821,35 @@ Specifying Sheets .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) Using the sheet index: .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + pd.read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) Using all default values: .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls') + pd.read_excel('path_to_file.xls') Using None to get all sheets: .. code-block:: python # Returns a dictionary of DataFrames - read_excel('path_to_file.xls', sheet_name=None) + pd.read_excel('path_to_file.xls', sheet_name=None) Using a list to get multiple sheets: .. code-block:: python # Returns the 1st and 4th sheet, as a dictionary of DataFrames. - read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) + pd.read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. @@ -2810,8 +2870,8 @@ For example, to read in a ``MultiIndex`` index without names: .. ipython:: python - df = pd.DataFrame({'a':[1, 2, 3, 4], 'b':[5, 6, 7, 8]}, - index=pd.MultiIndex.from_product([['a', 'b'],['c', 'd']])) + df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, + index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']])) df.to_excel('path_to_file.xlsx') df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) df @@ -2832,7 +2892,8 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python - df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], names=['c1', 'c2']) + df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], + names=['c1', 'c2']) df.to_excel('path_to_file.xlsx') df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1]) df @@ -2840,7 +2901,6 @@ should be passed to ``index_col`` and ``header``: .. 
ipython:: python :suppress: - import os os.remove('path_to_file.xlsx') @@ -2851,22 +2911,54 @@ It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. ``read_excel`` takes a ``usecols`` keyword to allow you to specify a subset of columns to parse. +.. deprecated:: 0.24.0 + +Passing in an integer for ``usecols`` has been deprecated. Please pass in a list +of ints from 0 to ``usecols`` inclusive instead. + If ``usecols`` is an integer, then it is assumed to indicate the last column to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=2) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=2) + +You can also specify a comma-delimited set of Excel columns and ranges as a string: + +.. code-block:: python + + pd.read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E') -If `usecols` is a list of integers, then it is assumed to be the file column +If ``usecols`` is a list of integers, then it is assumed to be the file column indices to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. +.. versionadded:: 0.24 + +If ``usecols`` is a list of strings, it is assumed that each string corresponds +to a column name provided either by the user in ``names`` or inferred from the +document header row(s). Those strings define which columns will be parsed: + +.. code-block:: python + + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) + +Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``. + +.. versionadded:: 0.24 + +If ``usecols`` is callable, the callable function will be evaluated against +the column names, returning names where the callable function evaluates to ``True``. + +.. code-block:: python + + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) + Parsing Dates +++++++++++++ @@ -2877,7 +2969,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes: .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) + pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) Cell Converters @@ -2888,7 +2980,7 @@ option. For instance, to convert a column to boolean: .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) This options handles missing values and treats exceptions in the converters as missing data. Transformations are applied cell by cell rather than to the @@ -2899,8 +2991,11 @@ missing data to recover integer dtype: .. code-block:: python - cfun = lambda x: int(x) if x else -1 - read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + def cfun(x): + return int(x) if x else -1 + + + pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) dtype Specifications ++++++++++++++++++++ @@ -2914,7 +3009,7 @@ no type inference, use the type ``str`` or ``object``. .. code-block:: python - read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) + pd.read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) .. _io.excel_writer: @@ -2952,7 +3047,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. 
code-block:: python - with ExcelWriter('path_to_file.xlsx') as writer: + with pd.ExcelWriter('path_to_file.xlsx') as writer: df1.to_excel(writer, sheet_name='Sheet1') df2.to_excel(writer, sheet_name='Sheet2') @@ -2984,7 +3079,7 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` bio = BytesIO() # By setting the 'engine' in the ExcelWriter constructor. - writer = ExcelWriter(bio, engine='xlsxwriter') + writer = pd.ExcelWriter(bio, engine='xlsxwriter') df.to_excel(writer, sheet_name='Sheet1') # Save the workbook @@ -3037,10 +3132,10 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') # By setting the 'engine' in the ExcelWriter constructor. - writer = ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') # Or via pandas configuration. - from pandas import options + from pandas import options # noqa: E402 options.io.excel.xlsx.writer = 'xlsxwriter' df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') @@ -3067,7 +3162,7 @@ which takes the contents of the clipboard buffer and passes them to the ``read_csv`` method. For instance, you can copy the following text to the clipboard (CTRL-C on many operating systems): -.. code-block:: python +.. code-block:: console A B C x 1 4 p @@ -3127,7 +3222,6 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python :suppress: - import os os.remove('foo.pkl') .. warning:: @@ -3143,10 +3237,10 @@ any pickled pandas object (or any other pickled object) from file: for such cases, pickled ``DataFrames``, ``Series`` etc, must be read with ``pd.read_pickle``, rather than ``pickle.load``. - See `here `__ - and `here `__ + See `here `__ + and `here `__ for some examples of compatibility-breaking changes. See - `this question `__ + `this question `__ for a detailed explanation. .. _io.pickle.compression: @@ -3204,7 +3298,6 @@ The default is to 'infer': .. ipython:: python :suppress: - import os os.remove("data.pkl.compress") os.remove("data.pkl.xz") os.remove("data.pkl.gz") @@ -3261,7 +3354,7 @@ pandas objects. .. ipython:: python - pd.to_msgpack('foo2.msg', {'dict': [{ 'df': df }, {'string': 'foo'}, + pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'}, {'scalar': 1.}, {'s': s}]}) pd.read_msgpack('foo2.msg') @@ -3294,7 +3387,7 @@ HDF5 (PyTables) ``HDFStore`` is a dict-like object which reads and writes pandas using the high performance HDF5 format using the excellent `PyTables -`__ library. See the :ref:`cookbook ` +`__ library. See the :ref:`cookbook ` for some advanced strategies .. warning:: @@ -3320,7 +3413,6 @@ dict: .. ipython:: python - np.random.seed(1234) index = pd.date_range('1/1/2000', periods=8) s = pd.Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) df = pd.DataFrame(randn(8, 3), index=index, @@ -3376,7 +3468,6 @@ Closing a Store and using a context manager: :suppress: store.close() - import os os.remove('store.h5') @@ -3389,8 +3480,8 @@ similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python - df_tl = pd.DataFrame(dict(A=list(range(5)), B=list(range(5)))) - df_tl.to_hdf('store_tl.h5','table', append=True) + df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))}) + df_tl.to_hdf('store_tl.h5', 'table', append=True) pd.read_hdf('store_tl.h5', 'table', where=['index>2']) .. ipython:: python @@ -3402,10 +3493,6 @@ similar to how ``read_csv`` and ``to_csv`` work. 
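To complement the top-level ``to_hdf``/``read_hdf`` round trip shown above, here is a hedged sketch of the same idea through the dict-like ``HDFStore`` interface (the file name ``demo.h5`` and variable names are illustrative); using the store as a context manager ensures the file is closed afterwards:

.. code-block:: python

   df_demo = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))})
   with pd.HDFStore('demo.h5', mode='w') as store_demo:
       # 'table' format is what allows where= style queries on read
       store_demo.put('df_demo', df_demo, format='table')
       subset = store_demo.select('df_demo', where='index > 2')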
HDFStore will by default not drop rows that are all missing. This behavior can be changed by setting ``dropna=True``. -.. ipython:: python - :suppress: - - import os .. ipython:: python @@ -3414,12 +3501,12 @@ HDFStore will by default not drop rows that are all missing. This behavior can b df_with_missing df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w') + format='table', mode='w') pd.read_hdf('file.h5', 'df_with_missing') df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w', dropna=True) + format='table', mode='w', dropna=True) pd.read_hdf('file.h5', 'df_with_missing') @@ -3433,13 +3520,13 @@ This is also true for the major axis of a ``Panel``: .. ipython:: python matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]], - [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], - [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] + [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], + [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] - panel_with_major_axis_all_missing=pd.Panel(matrix, - items=['Item1', 'Item2', 'Item3'], - major_axis=[1, 2], - minor_axis=['A', 'B', 'C']) + panel_with_major_axis_all_missing = pd.Panel(matrix, + items=['Item1', 'Item2', 'Item3'], + major_axis=[1, 2], + minor_axis=['A', 'B', 'C']) panel_with_major_axis_all_missing @@ -3476,9 +3563,8 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for .. code-block:: python - pd.DataFrame(randn(10, 2)).to_hdf('test_fixed.h5', 'df') - - pd.read_hdf('test_fixed.h5', 'df', where='index>5') + >>> pd.DataFrame(randn(10, 2)).to_hdf('test_fixed.h5', 'df') + >>> pd.read_hdf('test_fixed.h5', 'df', where='index>5') TypeError: cannot pass a where specification when reading a fixed format. this store must be selected in its entirety @@ -3541,7 +3627,7 @@ everything in the sub-store and **below**, so be *careful*. store.put('foo/bar/bah', df) store.append('food/orange', df) - store.append('food/apple', df) + store.append('food/apple', df) store # a list of keys are returned @@ -3574,7 +3660,7 @@ will yield a tuple for each group key along with the relative keys of its conten Hierarchical keys cannot be retrieved as dotted (attribute) access as described above for items stored under the root node. - .. code-block:: python + .. code-block:: ipython In [8]: store.foo.bar.bah AttributeError: 'HDFStore' object has no attribute 'foo' @@ -3616,14 +3702,15 @@ defaults to `nan`. df_mixed = pd.DataFrame({'A': randn(8), 'B': randn(8), 'C': np.array(randn(8), dtype='float32'), - 'string':'string', + 'string': 'string', 'int': 1, 'bool': True, 'datetime64': pd.Timestamp('20010102')}, index=list(range(8))) - df_mixed.loc[df_mixed.index[3:5], ['A', 'B', 'string', 'datetime64']] = np.nan + df_mixed.loc[df_mixed.index[3:5], + ['A', 'B', 'string', 'datetime64']] = np.nan - store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) + store.append('df_mixed', df_mixed, min_itemsize={'values': 50}) df_mixed1 = store.select('df_mixed') df_mixed1 df_mixed1.get_dtype_counts() @@ -3641,8 +3728,8 @@ storing/selecting from homogeneous index ``DataFrames``. 
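The hierarchical-keys discussion above notes that ``walk`` yields each group path together with the relative keys of its contents; as a hedged sketch (assuming a pandas version that provides ``HDFStore.walk``, and that the ``foo/...`` and ``food/...`` keys appended earlier are still present), the full key names can be reconstructed like this:

.. code-block:: python

   for (path, subgroups, subkeys) in store.walk():
       for subkey in subkeys:
           key = '/'.join([path, subkey])
           print(key)  # e.g. /foo/bar/bah, /food/apple, /food/orange
           # each full key can be handed straight back to store.select(key)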
index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['foo', 'bar']) df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) @@ -3732,10 +3819,10 @@ The right-hand side of the sub-expression (after a comparison operator) can be: instead of this - .. code-block:: python + .. code-block:: ipython string = "HolyMoly'" - store.select('df', 'index == %s' % string) + store.select('df', 'index == %s' % string) The latter will **not** work and will raise a ``SyntaxError``.Note that there's a single quote followed by a double quote in the ``string`` @@ -3776,7 +3863,8 @@ Works with a Panel as well. store.append('wp', wp) store - store.select('wp', "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") + store.select('wp', + "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3819,7 +3907,10 @@ specified in the format: ``()``, where float may be signed (and fra .. ipython:: python from datetime import timedelta - dftd = pd.DataFrame(dict(A = pd.Timestamp('20130101'), B = [ pd.Timestamp('20130101') + timedelta(days=i, seconds=10) for i in range(10) ])) + dftd = pd.DataFrame({'A': pd.Timestamp('20130101'), + 'B': [pd.Timestamp('20130101') + timedelta(days=i, + seconds=10) + for i in range(10)]}) dftd['C'] = dftd['A'] - dftd['B'] dftd store.append('dftd', dftd, data_columns=True) @@ -3878,7 +3969,7 @@ Then create the index when finished appending. os.remove('appends.h5') -See `here `__ for how to create a completely-sorted-index (CSI) on an existing store. +See `here `__ for how to create a completely-sorted-index (CSI) on an existing store. .. _io.hdf5-query-data-columns: @@ -3896,14 +3987,14 @@ be ``data_columns``. df_dc = df.copy() df_dc['string'] = 'foo' - df_dc.loc[df_dc.index[4: 6], 'string'] = np.nan - df_dc.loc[df_dc.index[7: 9], 'string'] = 'bar' + df_dc.loc[df_dc.index[4:6], 'string'] = np.nan + df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' df_dc['string2'] = 'cool' - df_dc.loc[df_dc.index[1: 3], ['B', 'C']] = 1.0 + df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0 df_dc # on-disk operations - store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) + store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2']) store.select('df_dc', where='B > 0') # getting creative @@ -3932,7 +4023,7 @@ The default is 50,000 rows returned in a chunk. .. ipython:: python for df in store.select('df', chunksize=3): - print(df) + print(df) .. note:: @@ -3941,7 +4032,7 @@ The default is 50,000 rows returned in a chunk. .. code-block:: python - for df in pd.read_hdf('store.h5','df', chunksize=3): + for df in pd.read_hdf('store.h5', 'df', chunksize=3): print(df) Note, that the chunksize keyword applies to the **source** rows. So if you @@ -3959,12 +4050,12 @@ chunks. store.append('dfeq', dfeq, data_columns=['number']) def chunks(l, n): - return [l[i: i+n] for i in range(0, len(l), n)] + return [l[i:i + n] for i in range(0, len(l), n)] evens = [2, 4, 6, 8, 10] coordinates = store.select_as_coordinates('dfeq', 'number=evens') for c in chunks(coordinates, 2): - print(store.select('dfeq', where=c)) + print(store.select('dfeq', where=c)) Advanced Queries ++++++++++++++++ @@ -4061,13 +4152,13 @@ results. .. 
ipython:: python df_mt = pd.DataFrame(randn(8, 6), index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) + columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None }, - df_mt, selector='df1_mt') + store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, + df_mt, selector='df1_mt') store # individual tables were created @@ -4076,7 +4167,7 @@ results. # as a multiple store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector = 'df1_mt') + selector='df1_mt') Delete from a Table @@ -4115,7 +4206,7 @@ the table using a ``where`` that selects all but the missing data. .. ipython:: python # returns the number of rows deleted - store.remove('wp', 'major_axis > 20000102' ) + store.remove('wp', 'major_axis > 20000102') store.select('wp') .. warning:: @@ -4151,8 +4242,8 @@ control compression: ``complevel`` and ``complib``. compression to choose depends on your specific needs and data. The list of supported compression libraries: - - `zlib `_: The default compression library. A classic in terms of compression, achieves good compression rates but is somewhat slow. - - `lzo `_: Fast compression and decompression. + - `zlib `_: The default compression library. A classic in terms of compression, achieves good compression rates but is somewhat slow. + - `lzo `_: Fast compression and decompression. - `bzip2 `_: Good compression rates. - `blosc `_: Fast compression and decompression. @@ -4171,7 +4262,7 @@ control compression: ``complevel`` and ``complib``. compression ratios at the expense of speed. - `blosc:snappy `_: A popular compressor used in many places. - - `blosc:zlib `_: A classic; + - `blosc:zlib `_: A classic; somewhat slower than the previous ones, but achieving better compression ratios. - `blosc:zstd `_: An @@ -4288,7 +4379,7 @@ stored in a more efficient manner. .. ipython:: python dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'), - 'B': np.random.randn(8) }) + 'B': np.random.randn(8)}) dfcat dfcat.dtypes cstore = pd.HDFStore('cats.h5', mode='w') @@ -4302,7 +4393,6 @@ stored in a more efficient manner. :okexcept: cstore.close() - import os os.remove('cats.h5') @@ -4330,7 +4420,7 @@ Passing a ``min_itemsize`` dict will cause all passed columns to be created as * .. ipython:: python - dfs = pd.DataFrame(dict(A='foo', B='bar'), index=list(range(5))) + dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5))) dfs # A and B have a size of 30 @@ -4349,7 +4439,7 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. ipython:: python - dfss = pd.DataFrame(dict(A=['foo', 'bar', 'nan'])) + dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']}) dfss store.append('dfss', dfss) @@ -4372,15 +4462,14 @@ tables. It is possible to write an ``HDFStore`` object that can easily be imported into ``R`` using the ``rhdf5`` library (`Package website`_). Create a table format store like this: -.. _package website: http://www.bioconductor.org/packages/release/bioc/html/rhdf5.html +.. _package website: https://www.bioconductor.org/packages/release/bioc/html/rhdf5.html .. 
ipython:: python - np.random.seed(1) df_for_r = pd.DataFrame({"first": np.random.rand(100), "second": np.random.rand(100), "class": np.random.randint(0, 2, (100, ))}, - index=range(100)) + index=range(100)) df_for_r.head() store_export = pd.HDFStore('export.h5') @@ -4391,7 +4480,6 @@ It is possible to write an ``HDFStore`` object that can easily be imported into :suppress: store_export.close() - import os os.remove('export.h5') In R this file can be read into a ``data.frame`` object using the ``rhdf5`` @@ -4471,7 +4559,7 @@ Performance * A ``PerformanceWarning`` will be raised if you are attempting to store types that will be pickled by PyTables (rather than stored as endemic types). See - `Here `__ + `Here `__ for more information and some solutions. @@ -4479,7 +4567,6 @@ Performance :suppress: store.close() - import os os.remove('store.h5') @@ -4545,7 +4632,6 @@ Read from a feather file. .. ipython:: python :suppress: - import os os.remove('example.feather') @@ -4579,7 +4665,7 @@ You can specify an ``engine`` to direct the serialization. This can be one of `` If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then ``pyarrow`` is tried, and falling back to ``fastparquet``. -See the documentation for `pyarrow `__ and `fastparquet `__. +See the documentation for `pyarrow `__ and `fastparquet `__. .. note:: @@ -4629,7 +4715,6 @@ Read only certain columns of a parquet file. .. ipython:: python :suppress: - import os os.remove('example_pa.parquet') os.remove('example_fp.parquet') @@ -4668,6 +4753,44 @@ Passing ``index=True`` will *always* write the index, even if that's not the underlying engine's default behavior. +Partitioning Parquet files +'''''''''''''''''''''''''' + +.. versionadded:: 0.24.0 + +Parquet supports partitioning of data based on the values of one or more columns. + +.. ipython:: python + + df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]}) + df.to_parquet(fname='test', engine='pyarrow', + partition_cols=['a'], compression=None) + +The `fname` specifies the parent directory to which data will be saved. +The `partition_cols` are the column names by which the dataset will be partitioned. +Columns are partitioned in the order they are given. The partition splits are +determined by the unique values in the partition columns. +The above example creates a partitioned dataset that may look like: + +.. code-block:: text + + test + ├── a=0 + │ ├── 0bac803e32dc42ae83fddfd029cbdebc.parquet + │ └── ... + └── a=1 + ├── e6ab24a4f45147b49b54a662f0c412a3.parquet + └── ... + +.. ipython:: python + :suppress: + + from shutil import rmtree + try: + rmtree('test') + except Exception: + pass + .. _io.sql: SQL Queries @@ -4681,13 +4804,13 @@ for PostgreSQL or `pymysql `__ for MySQL. For `SQLite `__ this is included in Python's standard library by default. You can find an overview of supported drivers for each SQL dialect in the -`SQLAlchemy docs `__. +`SQLAlchemy docs `__. If SQLAlchemy is not installed, a fallback is only provided for sqlite (and for mysql for backwards compatibility, but this is deprecated and will be removed in a future version). This mode requires a Python database adapter which respect the `Python -DB-API `__. +DB-API `__. See also some :ref:`cookbook examples ` for some advanced strategies. @@ -4709,7 +4832,7 @@ The key functions are: the provided input (database table name or sql query). Table names do not need to be quoted if they have special characters. 
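As a hedged sketch of how the convenience wrapper ``read_sql`` delegates to the two specialised readers just listed (the in-memory SQLite URL and the table name ``demo`` are purely illustrative):

.. code-block:: python

   from sqlalchemy import create_engine

   engine_demo = create_engine('sqlite:///:memory:')
   pd.DataFrame({'a': [1, 2, 3]}).to_sql('demo', engine_demo, index=False)

   pd.read_sql('demo', engine_demo)                      # delegates to read_sql_table
   pd.read_sql('SELECT a FROM demo WHERE a > 1',
               engine_demo)                              # delegates to read_sql_query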
-In the following example, we use the `SQlite `__ SQL database +In the following example, we use the `SQlite `__ SQL database engine. You can use a temporary SQLite database where data are stored in "memory". @@ -4717,7 +4840,7 @@ To connect with SQLAlchemy you use the :func:`create_engine` function to create object from database URI. You only need to create the engine once per database you are connecting to. For more information on :func:`create_engine` and the URI formatting, see the examples -below and the SQLAlchemy `documentation `__ +below and the SQLAlchemy `documentation `__ .. ipython:: python @@ -4754,14 +4877,15 @@ the database using :func:`~pandas.DataFrame.to_sql`. import datetime c = ['id', 'Date', 'Col_1', 'Col_2', 'Col_3'] - d = [(26, datetime.datetime(2010,10,18), 'X', 27.5, True), - (42, datetime.datetime(2010,10,19), 'Y', -12.5, False), - (63, datetime.datetime(2010,10,20), 'Z', 5.73, True)] + d = [(26, datetime.datetime(2010, 10, 18), 'X', 27.5, True), + (42, datetime.datetime(2010, 10, 19), 'Y', -12.5, False), + (63, datetime.datetime(2010, 10, 20), 'Z', 5.73, True)] - data = pd.DataFrame(d, columns=c) + data = pd.DataFrame(d, columns=c) .. ipython:: python + data data.to_sql('data', engine) With some databases, writing large DataFrames can result in errors due to @@ -4806,6 +4930,36 @@ default ``Text`` type for string columns: Because of this, reading the database table back in does **not** generate a categorical. +.. _io.sql_datetime_data: + +Datetime data types +''''''''''''''''''' + +Using SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing +datetime data that is timezone naive or timezone aware. However, the resulting +data stored in the database ultimately depends on the supported data type +for datetime data of the database system being used. + +The following table lists supported data types for datetime data for some +common databases. Other database dialects may have different data types for +datetime data. + +=========== ============================================= =================== +Database SQL Datetime Types Timezone Support +=========== ============================================= =================== +SQLite ``TEXT`` No +MySQL ``TIMESTAMP`` or ``DATETIME`` No +PostgreSQL ``TIMESTAMP`` or ``TIMESTAMP WITH TIME ZONE`` Yes +=========== ============================================= =================== + +When writing timezone aware data to databases that do not support timezones, +the data will be written as timezone naive timestamps that are in local time +with respect to the timezone. + +:func:`~pandas.read_sql_table` is also capable of reading datetime data that is +timezone aware or naive. When reading ``TIMESTAMP WITH TIME ZONE`` types, pandas +will convert the data to UTC. + Reading Tables '''''''''''''' @@ -4841,7 +4995,8 @@ to pass to :func:`pandas.to_datetime`: .. code-block:: python pd.read_sql_table('data', engine, parse_dates={'Date': '%Y-%m-%d'}) - pd.read_sql_table('data', engine, parse_dates={'Date': {'format': '%Y-%m-%d %H:%M:%S'}}) + pd.read_sql_table('data', engine, + parse_dates={'Date': {'format': '%Y-%m-%d %H:%M:%S'}}) You can check if a table exists using :func:`~pandas.io.sql.has_table` @@ -4887,7 +5042,8 @@ Specifying this will return an iterator through chunks of the query result: .. 
ipython:: python - for chunk in pd.read_sql_query("SELECT * FROM data_chunks", engine, chunksize=5): + for chunk in pd.read_sql_query("SELECT * FROM data_chunks", + engine, chunksize=5): print(chunk) You can also run a plain query without creating a ``DataFrame`` with @@ -4930,7 +5086,7 @@ connecting to. # or absolute, starting with a slash: engine = create_engine('sqlite:////absolute/path/to/foo.db') -For more information see the examples the SQLAlchemy `documentation `__ +For more information see the examples the SQLAlchemy `documentation `__ Advanced SQLAlchemy queries @@ -4952,14 +5108,14 @@ If you have an SQLAlchemy description of your database you can express where con metadata = sa.MetaData() data_table = sa.Table('data', metadata, - sa.Column('index', sa.Integer), - sa.Column('Date', sa.DateTime), - sa.Column('Col_1', sa.String), - sa.Column('Col_2', sa.Float), - sa.Column('Col_3', sa.Boolean), - ) + sa.Column('index', sa.Integer), + sa.Column('Date', sa.DateTime), + sa.Column('Col_1', sa.String), + sa.Column('Col_2', sa.Float), + sa.Column('Col_3', sa.Boolean), + ) - pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 == True), engine) + pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 is True), engine) You can combine SQLAlchemy expressions with parameters passed to :func:`read_sql` using :func:`sqlalchemy.bindparam` @@ -4975,7 +5131,7 @@ Sqlite fallback The use of sqlite is supported without using SQLAlchemy. This mode requires a Python database adapter which respect the `Python -DB-API `__. +DB-API `__. You can create connections like so: @@ -4988,7 +5144,7 @@ And then issue the following queries: .. code-block:: python - data.to_sql('data', cnx) + data.to_sql('data', con) pd.read_sql_query("SELECT * FROM data", con) @@ -5127,7 +5283,6 @@ values will have ``object`` data type. .. ipython:: python :suppress: - import os os.remove('stata.dta') .. _io.stata-categorical: @@ -5206,6 +5361,9 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time: .. code-block:: python + def do_something(chunk): + pass + rdr = pd.read_sas('sas_xport.xpt', chunk=100000) for chunk in rdr: do_something(chunk) @@ -5233,7 +5391,7 @@ xarray_ provides data structures inspired by the pandas ``DataFrame`` for workin with multi-dimensional datasets, with a focus on the netCDF file format and easy conversion to and from pandas. -.. _xarray: http://xarray.pydata.org/ +.. _xarray: https://xarray.pydata.org/ .. _io.perf: @@ -5258,6 +5416,94 @@ ignored. dtypes: float64(1), int64(1) memory usage: 15.3 MB +Given the next test set: + +.. 
code-block:: python + + from numpy.random import randn + + sz = 1000000 + df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz}) + + + def test_sql_write(df): + if os.path.exists('test.sql'): + os.remove('test.sql') + sql_db = sqlite3.connect('test.sql') + df.to_sql(name='test_table', con=sql_db) + sql_db.close() + + + def test_sql_read(): + sql_db = sqlite3.connect('test.sql') + pd.read_sql_query("select * from test_table", sql_db) + sql_db.close() + + + def test_hdf_fixed_write(df): + df.to_hdf('test_fixed.hdf', 'test', mode='w') + + + def test_hdf_fixed_read(): + pd.read_hdf('test_fixed.hdf', 'test') + + + def test_hdf_fixed_write_compress(df): + df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc') + + + def test_hdf_fixed_read_compress(): + pd.read_hdf('test_fixed_compress.hdf', 'test') + + + def test_hdf_table_write(df): + df.to_hdf('test_table.hdf', 'test', mode='w', format='table') + + + def test_hdf_table_read(): + pd.read_hdf('test_table.hdf', 'test') + + + def test_hdf_table_write_compress(df): + df.to_hdf('test_table_compress.hdf', 'test', mode='w', + complib='blosc', format='table') + + + def test_hdf_table_read_compress(): + pd.read_hdf('test_table_compress.hdf', 'test') + + + def test_csv_write(df): + df.to_csv('test.csv', mode='w') + + + def test_csv_read(): + pd.read_csv('test.csv', index_col=0) + + + def test_feather_write(df): + df.to_feather('test.feather') + + + def test_feather_read(): + pd.read_feather('test.feather') + + + def test_pickle_write(df): + df.to_pickle('test.pkl') + + + def test_pickle_read(): + pd.read_pickle('test.pkl') + + + def test_pickle_write_compress(df): + df.to_pickle('test.pkl.compress', compression='xz') + + + def test_pickle_read_compress(): + pd.read_pickle('test.pkl.compress', compression='xz') + When writing, the top-three functions in terms of speed are are ``test_pickle_write``, ``test_feather_write`` and ``test_hdf_fixed_write_compress``. @@ -5335,76 +5581,3 @@ Space on disk (in bytes) 16000248 Aug 21 18:00 test.feather 16000848 Aug 21 18:00 test.pkl 7554108 Aug 21 18:00 test.pkl.compress - -And here's the code: - -.. 
code-block:: python - - import os - import pandas as pd - import sqlite3 - from numpy.random import randn - from pandas.io import sql - - sz = 1000000 - df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz}) - - def test_sql_write(df): - if os.path.exists('test.sql'): - os.remove('test.sql') - sql_db = sqlite3.connect('test.sql') - df.to_sql(name='test_table', con=sql_db) - sql_db.close() - - def test_sql_read(): - sql_db = sqlite3.connect('test.sql') - pd.read_sql_query("select * from test_table", sql_db) - sql_db.close() - - def test_hdf_fixed_write(df): - df.to_hdf('test_fixed.hdf', 'test', mode='w') - - def test_hdf_fixed_read(): - pd.read_hdf('test_fixed.hdf', 'test') - - def test_hdf_fixed_write_compress(df): - df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc') - - def test_hdf_fixed_read_compress(): - pd.read_hdf('test_fixed_compress.hdf', 'test') - - def test_hdf_table_write(df): - df.to_hdf('test_table.hdf', 'test', mode='w', format='table') - - def test_hdf_table_read(): - pd.read_hdf('test_table.hdf', 'test') - - def test_hdf_table_write_compress(df): - df.to_hdf('test_table_compress.hdf', 'test', mode='w', complib='blosc', format='table') - - def test_hdf_table_read_compress(): - pd.read_hdf('test_table_compress.hdf', 'test') - - def test_csv_write(df): - df.to_csv('test.csv', mode='w') - - def test_csv_read(): - pd.read_csv('test.csv', index_col=0) - - def test_feather_write(df): - df.to_feather('test.feather') - - def test_feather_read(): - pd.read_feather('test.feather') - - def test_pickle_write(df): - df.to_pickle('test.pkl') - - def test_pickle_read(): - pd.read_pickle('test.pkl') - - def test_pickle_write_compress(df): - df.to_pickle('test.pkl.compress', compression='xz') - - def test_pickle_read_compress(): - pd.read_pickle('test.pkl.compress', compression='xz') diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 98914c13d4d31..8a25d991c149b 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -31,10 +31,10 @@ operations. Concatenating objects --------------------- -The :func:`~pandas.concat` function (in the main pandas namespace) does all of -the heavy lifting of performing concatenation operations along an axis while -performing optional set logic (union or intersection) of the indexes (if any) on -the other axes. Note that I say "if any" because there is only a single possible +The :func:`~pandas.concat` function (in the main pandas namespace) does all of +the heavy lifting of performing concatenation operations along an axis while +performing optional set logic (union or intersection) of the indexes (if any) on +the other axes. Note that I say "if any" because there is only a single possible axis of concatenation for Series. Before diving into all of the details of ``concat`` and what it can do, here is @@ -109,9 +109,9 @@ some configurable handling of "what to do with the other axes": to the actual data concatenation. * ``copy`` : boolean, default True. If False, do not copy data unnecessarily. -Without a little bit of context many of these arguments don't make much sense. -Let's revisit the above example. Suppose we wanted to associate specific keys -with each of the pieces of the chopped up DataFrame. We can do this using the +Without a little bit of context many of these arguments don't make much sense. +Let's revisit the above example. Suppose we wanted to associate specific keys +with each of the pieces of the chopped up DataFrame. We can do this using the ``keys`` argument: .. 
ipython:: python @@ -138,9 +138,9 @@ It's not a stretch to see how this can be very useful. More detail on this functionality below. .. note:: - It is worth noting that :func:`~pandas.concat` (and therefore - :func:`~pandas.append`) makes a full copy of the data, and that constantly - reusing this function can create a significant performance hit. If you need + It is worth noting that :func:`~pandas.concat` (and therefore + :func:`~pandas.append`) makes a full copy of the data, and that constantly + reusing this function can create a significant performance hit. If you need to use the operation over several datasets, use a list comprehension. :: @@ -224,8 +224,8 @@ DataFrame: Concatenating using ``append`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A useful shortcut to :func:`~pandas.concat` are the :meth:`~DataFrame.append` -instance methods on ``Series`` and ``DataFrame``. These methods actually predated +A useful shortcut to :func:`~pandas.concat` are the :meth:`~DataFrame.append` +instance methods on ``Series`` and ``DataFrame``. These methods actually predated ``concat``. They concatenate along ``axis=0``, namely the index: .. ipython:: python @@ -271,8 +271,8 @@ need to be: .. note:: - Unlike the :py:meth:`~list.append` method, which appends to the original list - and returns ``None``, :meth:`~DataFrame.append` here **does not** modify + Unlike the :py:meth:`~list.append` method, which appends to the original list + and returns ``None``, :meth:`~DataFrame.append` here **does not** modify ``df1`` and returns its copy with ``df2`` appended. .. _merging.ignore_index: @@ -370,9 +370,9 @@ Passing ``ignore_index=True`` will drop all name references. More concatenating with group keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A fairly common use of the ``keys`` argument is to override the column names +A fairly common use of the ``keys`` argument is to override the column names when creating a new ``DataFrame`` based on existing ``Series``. -Notice how the default behaviour consists on letting the resulting ``DataFrame`` +Notice how the default behaviour consists on letting the resulting ``DataFrame`` inherit the parent ``Series``' name, when these existed. .. ipython:: python @@ -468,7 +468,7 @@ Appending rows to a DataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ While not especially efficient (since a new object must be created), you can -append a single row to a ``DataFrame`` by passing a ``Series`` or dict to +append a single row to a ``DataFrame`` by passing a ``Series`` or dict to ``append``, which returns a new ``DataFrame`` as above. .. ipython:: python @@ -513,7 +513,7 @@ pandas has full-featured, **high performance** in-memory join operations idiomatically very similar to relational databases like SQL. These methods perform significantly better (in some cases well over an order of magnitude better) than other open source implementations (like ``base::merge.data.frame`` -in R). The reason for this is careful algorithmic design and the internal layout +in R). The reason for this is careful algorithmic design and the internal layout of the data in ``DataFrame``. See the :ref:`cookbook` for some advanced strategies. @@ -521,7 +521,7 @@ See the :ref:`cookbook` for some advanced strategies. Users who are familiar with SQL but new to pandas might be interested in a :ref:`comparison with SQL`. 
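Before the formal description of :func:`~pandas.merge` that follows, here is a hedged minimal preview (frame names and key values are invented for this sketch) of the SQL-style inner join behaviour referred to above:

.. code-block:: python

   left_demo = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['A0', 'A1', 'A2']})
   right_demo = pd.DataFrame({'key': ['K0', 'K1', 'K3'], 'B': ['B0', 'B1', 'B3']})

   # only keys present in both frames (K0 and K1) survive an inner join
   pd.merge(left_demo, right_demo, on='key', how='inner')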
-pandas provides a single function, :func:`~pandas.merge`, as the entry point for +pandas provides a single function, :func:`~pandas.merge`, as the entry point for all standard database join operations between ``DataFrame`` or named ``Series`` objects: :: @@ -590,7 +590,7 @@ The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` or and ``right`` is a subclass of ``DataFrame``, the return type will still be ``DataFrame``. ``merge`` is a function in the pandas namespace, and it is also available as a -``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling +``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling ``DataFrame`` being implicitly considered the left object in the join. The related :meth:`~DataFrame.join` method, uses ``merge`` internally for the @@ -602,7 +602,7 @@ Brief primer on merge methods (relational algebra) Experienced users of relational databases like SQL will be familiar with the terminology used to describe join operations between two SQL-table like -structures (``DataFrame`` objects). There are several cases to consider which +structures (``DataFrame`` objects). There are several cases to consider which are very important to understand: * **one-to-one** joins: for example when joining two ``DataFrame`` objects on @@ -642,8 +642,8 @@ key combination: labels=['left', 'right'], vertical=False); plt.close('all'); -Here is a more complicated example with multiple join keys. Only the keys -appearing in ``left`` and ``right`` are present (the intersection), since +Here is a more complicated example with multiple join keys. Only the keys +appearing in ``left`` and ``right`` are present (the intersection), since ``how='inner'`` by default. .. ipython:: python @@ -759,13 +759,13 @@ Checking for duplicate keys .. versionadded:: 0.21.0 -Users can use the ``validate`` argument to automatically check whether there -are unexpected duplicates in their merge keys. Key uniqueness is checked before -merge operations and so should protect against memory overflows. Checking key -uniqueness is also a good way to ensure user data structures are as expected. +Users can use the ``validate`` argument to automatically check whether there +are unexpected duplicates in their merge keys. Key uniqueness is checked before +merge operations and so should protect against memory overflows. Checking key +uniqueness is also a good way to ensure user data structures are as expected. -In the following example, there are duplicate values of ``B`` in the right -``DataFrame``. As this is not a one-to-one merge -- as specified in the +In the following example, there are duplicate values of ``B`` in the right +``DataFrame``. As this is not a one-to-one merge -- as specified in the ``validate`` argument -- an exception will be raised. @@ -778,11 +778,11 @@ In the following example, there are duplicate values of ``B`` in the right In [53]: result = pd.merge(left, right, on='B', how='outer', validate="one_to_one") ... - MergeError: Merge keys are not unique in right dataset; not a one-to-one merge + MergeError: Merge keys are not unique in right dataset; not a one-to-one merge -If the user is aware of the duplicates in the right ``DataFrame`` but wants to -ensure there are no duplicates in the left DataFrame, one can use the -``validate='one_to_many'`` argument instead, which will not raise an exception. 
+If the user is aware of the duplicates in the right ``DataFrame`` but wants to +ensure there are no duplicates in the left DataFrame, one can use the +``validate='one_to_many'`` argument instead, which will not raise an exception. .. ipython:: python @@ -794,8 +794,8 @@ ensure there are no duplicates in the left DataFrame, one can use the The merge indicator ~~~~~~~~~~~~~~~~~~~ -:func:`~pandas.merge` accepts the argument ``indicator``. If ``True``, a -Categorical-type column called ``_merge`` will be added to the output object +:func:`~pandas.merge` accepts the argument ``indicator``. If ``True``, a +Categorical-type column called ``_merge`` will be added to the output object that takes on values: =================================== ================ @@ -903,7 +903,7 @@ Joining on index ~~~~~~~~~~~~~~~~ :meth:`DataFrame.join` is a convenient method for combining the columns of two -potentially differently-indexed ``DataFrames`` into a single result +potentially differently-indexed ``DataFrames`` into a single result ``DataFrame``. Here is a very basic example: .. ipython:: python @@ -983,9 +983,9 @@ indexes: Joining key columns on an index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.join` takes an optional ``on`` argument which may be a column +:meth:`~DataFrame.join` takes an optional ``on`` argument which may be a column or multiple column names, which specifies that the passed ``DataFrame`` is to be -aligned on that column in the ``DataFrame``. These two function calls are +aligned on that column in the ``DataFrame``. These two function calls are completely equivalent: :: @@ -995,7 +995,7 @@ completely equivalent: how='left', sort=False) Obviously you can choose whichever form you find more convenient. For -many-to-one joins (where one of the ``DataFrame``'s is already indexed by the +many-to-one joins (where one of the ``DataFrame``'s is already indexed by the join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python @@ -1133,17 +1133,42 @@ This is equivalent but less verbose and more memory efficient / faster than this Joining with two MultiIndexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This is not implemented via ``join`` at-the-moment, however it can be done using -the following code. +This is supported in a limited way, provided that the index for the right +argument is completely used in the join, and is a subset of the indices in +the left argument, as in this example: .. ipython:: python - index = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), - ('K1', 'X2')], - names=['key', 'X']) + leftindex = pd.MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], + names=['abc', 'xy', 'num']) + left = pd.DataFrame({'v1' : range(12)}, index=leftindex) + left + + rightindex = pd.MultiIndex.from_product([list('abc'), list('xy')], + names=['abc', 'xy']) + right = pd.DataFrame({'v2': [100*i for i in range(1, 7)]}, index=rightindex) + right + + left.join(right, on=['abc', 'xy'], how='inner') + +If that condition is not satisfied, a join with two multi-indexes can be +done using the following code. + +.. 
ipython:: python + + leftindex = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, - index=index) + index=leftindex) + + rightindex = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=rightindex) result = pd.merge(left.reset_index(), right.reset_index(), on=['key'], how='inner').set_index(['key','X','Y']) @@ -1161,7 +1186,7 @@ the following code. Merging on a combination of columns and index levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.22 +.. versionadded:: 0.23 Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters may refer to either column names or index level names. This enables merging @@ -1200,6 +1225,12 @@ resetting indexes. frames, the index level is preserved as an index level in the resulting DataFrame. +.. note:: + When DataFrames are merged using only some of the levels of a `MultiIndex`, + the extra levels will be dropped from the resulting merge. In order to + preserve those levels, use ``reset_index`` on those level names to move + those levels to columns prior to doing the merge. + .. note:: If a string matches both a column name and an index level name, then a @@ -1262,7 +1293,7 @@ similarly. Joining multiple DataFrame or Panel objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A list or tuple of ``DataFrames`` can also be passed to :meth:`~DataFrame.join` +A list or tuple of ``DataFrames`` can also be passed to :meth:`~DataFrame.join` to join them together on their indexes. .. ipython:: python @@ -1284,7 +1315,7 @@ Merging together values within Series or DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Another fairly common situation is to have two like-indexed (or similarly -indexed) ``Series`` or ``DataFrame`` objects and wanting to "patch" values in +indexed) ``Series`` or ``DataFrame`` objects and wanting to "patch" values in one object from values for matching indices in the other. Here is an example: .. ipython:: python @@ -1309,7 +1340,7 @@ For this, use the :meth:`~DataFrame.combine_first` method: plt.close('all'); Note that this method only takes values from the right ``DataFrame`` if they are -missing in the left ``DataFrame``. A related method, :meth:`~DataFrame.update`, +missing in the left ``DataFrame``. A related method, :meth:`~DataFrame.update`, alters non-NA values in place: .. ipython:: python @@ -1361,15 +1392,15 @@ Merging AsOf .. versionadded:: 0.19.0 -A :func:`merge_asof` is similar to an ordered left-join except that we match on -nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``, -we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less +A :func:`merge_asof` is similar to an ordered left-join except that we match on +nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``, +we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less than the left's key. Both DataFrames must be sorted by the key. -Optionally an asof merge can perform a group-wise merge. This matches the +Optionally an asof merge can perform a group-wise merge. This matches the ``by`` key equally, in addition to the nearest match on the ``on`` key. 
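Before the fuller ``trades``/``quotes`` example below, here is a hedged minimal sketch of the nearest-key matching (the integer ``on`` key and its values are invented for illustration; both frames must already be sorted by the key):

.. code-block:: python

   left_demo = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
   right_demo = pd.DataFrame({'a': [1, 2, 3, 6, 7],
                              'right_val': [1, 2, 3, 6, 7]})

   # each left row takes the last right row whose 'a' is less than or equal to its own
   pd.merge_asof(left_demo, right_demo, on='a')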
-For example; we might have ``trades`` and ``quotes`` and we want to ``asof`` +For example; we might have ``trades`` and ``quotes`` and we want to ``asof`` merge them. .. ipython:: python @@ -1428,8 +1459,8 @@ We only asof within ``2ms`` between the quote time and the trade time. by='ticker', tolerance=pd.Timedelta('2ms')) -We only asof within ``10ms`` between the quote time and the trade time and we -exclude exact matches on time. Note that though we exclude the exact matches +We only asof within ``10ms`` between the quote time and the trade time and we +exclude exact matches on time. Note that though we exclude the exact matches (of the quotes), prior quotes **do** propagate to that point in time. .. ipython:: python diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index e4b5578af15f0..7b6d338ee5b6a 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -678,7 +678,7 @@ Replacing more than one value is possible by passing a list. .. ipython:: python - df00 = df.values[0, 0] + df00 = df.iloc[0, 0] df.replace([1.5, df00], [np.nan, 'a']) df[1].dtype @@ -696,9 +696,8 @@ You can also operate on the DataFrame in place: .. code-block:: python - s = pd.Series([True, False, True]) - s.replace({'a string': 'new value', True: False}) # raises - + >>> s = pd.Series([True, False, True]) + >>> s.replace({'a string': 'new value', True: False}) # raises TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' will raise a ``TypeError`` because one of the ``dict`` keys is not of the @@ -728,7 +727,7 @@ rules introduced in the table below. :header: "data type", "Cast to" :widths: 40, 40 - integer, float + integer, float boolean, object float, no cast object, no cast diff --git a/doc/source/options.rst b/doc/source/options.rst index cbe0264f442bc..dc4d0da32008c 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -137,7 +137,7 @@ Using startup scripts for the python/ipython environment to import pandas and se $IPYTHONDIR/profile_default/startup More information can be found in the `ipython documentation -`__. An example startup script for pandas is displayed below: +`__. An example startup script for pandas is displayed below: .. code-block:: python diff --git a/doc/source/overview.rst b/doc/source/overview.rst index 6ba9501ba0b5e..b71f4bfa2f3be 100644 --- a/doc/source/overview.rst +++ b/doc/source/overview.rst @@ -82,7 +82,7 @@ Getting Support The first stop for pandas issues and ideas is the `Github Issue Tracker `__. If you have a general question, pandas community experts can answer through `Stack Overflow -`__. +`__. Community --------- @@ -92,7 +92,7 @@ the world who contribute their valuable time and energy to help make open source pandas possible. Thanks to `all of our contributors `__. If you're interested in contributing, please -visit `Contributing to pandas webpage `__. +visit `Contributing to pandas webpage `__. pandas is a `NumFOCUS `__ sponsored project. This will help ensure the success of development of pandas as a world-class open-source diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index 88634d7f75c63..f40f9199aaf66 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -33,10 +33,11 @@ See also the documentation of the `rpy2 `__ project: In the remainder of this page, a few examples of explicit conversion is given. The pandas conversion of rpy2 needs first to be activated: -.. ipython:: python +.. 
ipython:: + :verbatim: - from rpy2.robjects import r, pandas2ri - pandas2ri.activate() + In [1]: from rpy2.robjects import pandas2ri + ...: pandas2ri.activate() Transferring R data sets into Python ------------------------------------ @@ -44,10 +45,21 @@ Transferring R data sets into Python Once the pandas conversion is activated (``pandas2ri.activate()``), many conversions of R to pandas objects will be done automatically. For example, to obtain the 'iris' dataset as a pandas DataFrame: -.. ipython:: python +.. ipython:: + :verbatim: + + In [2]: from rpy2.robjects import r - r.data('iris') - r['iris'].head() + In [3]: r.data('iris') + + In [4]: r['iris'].head() + Out[4]: + Sepal.Length Sepal.Width Petal.Length Petal.Width Species + 0 5.1 3.5 1.4 0.2 setosa + 1 4.9 3.0 1.4 0.2 setosa + 2 4.7 3.2 1.3 0.2 setosa + 3 4.6 3.1 1.5 0.2 setosa + 4 5.0 3.6 1.4 0.2 setosa If the pandas conversion was not activated, the above could also be accomplished by explicitly converting it with the ``pandas2ri.ri2py`` function @@ -59,13 +71,24 @@ Converting DataFrames into R objects The ``pandas2ri.py2ri`` function support the reverse operation to convert DataFrames into the equivalent R object (that is, **data.frame**): -.. ipython:: python +.. ipython:: + :verbatim: + + In [5]: df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}, + ...: index=["one", "two", "three"]) + + In [6]: r_dataframe = pandas2ri.py2ri(df) + + In [7]: print(type(r_dataframe)) + Out[7]: + + In [8]: print(r_dataframe) + Out[8]: + A B C + one 1 4 7 + two 2 5 8 + three 3 6 9 - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, - index=["one", "two", "three"]) - r_dataframe = pandas2ri.py2ri(df) - print(type(r_dataframe)) - print(r_dataframe) The DataFrame's index is stored as the ``rownames`` attribute of the data.frame instance. diff --git a/doc/source/release.rst b/doc/source/release.rst index cd04288dce2c2..af6fc23e12b78 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -20,7 +20,7 @@ Release Notes ************* This is the list of changes to pandas between each release. For full details, -see the commit logs at http://github.com/pandas-dev/pandas +see the commit logs at https://github.com/pandas-dev/pandas **What is it** @@ -33,9 +33,9 @@ analysis / manipulation tool available in any language. **Where to get it** -* Source code: http://github.com/pandas-dev/pandas +* Source code: https://github.com/pandas-dev/pandas * Binary installers on PyPI: https://pypi.org/project/pandas -* Documentation: http://pandas.pydata.org +* Documentation: https://pandas.pydata.org pandas 0.23.2 ------------- @@ -586,7 +586,7 @@ Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here `. - New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying categoricals independent of the data, see :ref:`here `. -- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here `. +- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here `. 
- Compatibility fixes for pypy, see :ref:`here `. - Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here `. - Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here `) and ``GroupBy.pipe`` (see :ref:`here `). @@ -1171,7 +1171,7 @@ Highlights include: - Sparse data structures gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` - Comparison operations with ``Series`` no longer ignores the index, see :ref:`here ` for an overview of the API changes. - Introduction of a pandas development API for utility functions, see :ref:`here `. -- Deprecation of ``Panel4D`` and ``PanelND``. We recommend to represent these types of n-dimensional data with the `xarray package `__. +- Deprecation of ``Panel4D`` and ``PanelND``. We recommend to represent these types of n-dimensional data with the `xarray package `__. - Removal of the previously deprecated modules ``pandas.io.data``, ``pandas.io.wb``, ``pandas.tools.rplot``. See the :ref:`v0.19.0 Whatsnew ` overview for an extensive list @@ -1402,7 +1402,7 @@ Highlights include: - Removal of support for positional indexing with floats, which was deprecated since 0.14.0. This will now raise a ``TypeError``, see :ref:`here `. - The ``.to_xarray()`` function has been added for compatibility with the - `xarray package `__, see :ref:`here `. + `xarray package `__, see :ref:`here `. - The ``read_sas`` function has been enhanced to read ``sas7bdat`` files, see :ref:`here `. - Addition of the :ref:`.str.extractall() method `, and API changes to the :ref:`.str.extract() method ` @@ -1757,7 +1757,7 @@ along with several new features, enhancements, and performance improvements. Highlights include: - A new ``pipe`` method, see :ref:`here ` -- Documentation on how to use `numba `_ with *pandas*, see :ref:`here ` +- Documentation on how to use `numba `_ with *pandas*, see :ref:`here ` See the :ref:`v0.16.2 Whatsnew ` overview for an extensive list of all enhancements and bugs that have been fixed in 0.16.2. @@ -1889,9 +1889,9 @@ Highlights include: - Changes to the default for ordering in the ``Categorical`` constructor, see :ref:`here ` - The ``pandas.tools.rplot``, ``pandas.sandbox.qtpandas`` and ``pandas.rpy`` modules are deprecated. We refer users to external packages like - `seaborn `_, + `seaborn `_, `pandas-qt `_ and - `rpy2 `_ for similar or equivalent + `rpy2 `_ for similar or equivalent functionality, see :ref:`here ` See the :ref:`v0.16.0 Whatsnew ` overview or the issue tracker on GitHub for an extensive list diff --git a/doc/source/releases.rst b/doc/source/releases.rst new file mode 100644 index 0000000000000..0167903cce8bc --- /dev/null +++ b/doc/source/releases.rst @@ -0,0 +1,203 @@ +.. _release: + +************* +Release Notes +************* + +This is the list of changes to pandas between each release. For full details, +see the commit logs at http://github.com/pandas-dev/pandas. For install and +upgrade instructions, see :ref:`install`. + +Version 0.24 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.24.0 + +Version 0.23 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.23.4 + whatsnew/v0.23.3 + whatsnew/v0.23.2 + whatsnew/v0.23.1 + whatsnew/v0.23.0 + +Version 0.22 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.22.0 + +Version 0.21 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.21.0 + whatsnew/v0.21.1 + +Version 0.20 +------------ + +.. 
toctree:: + :maxdepth: 2 + + whatsnew/v0.20.0 + whatsnew/v0.20.2 + whatsnew/v0.20.3 + +Version 0.19 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.19.0 + whatsnew/v0.19.1 + whatsnew/v0.19.2 + +Version 0.18 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.18.0 + whatsnew/v0.18.1 + +Version 0.17 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.17.0 + whatsnew/v0.17.1 + +Version 0.16 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.16.0 + whatsnew/v0.16.1 + whatsnew/v0.16.2 + +Version 0.15 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.15.0 + whatsnew/v0.15.1 + whatsnew/v0.15.2 + +Version 0.14 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.14.0 + whatsnew/v0.14.1 + +Version 0.13 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.13.0 + whatsnew/v0.13.1 + +Version 0.12 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.12.0 + +Version 0.11 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.11.0 + +Version 0.10 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.10.0 + whatsnew/v0.10.1 + +Version 0.9 +----------- + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.9.0 + whatsnew/v0.9.1 + +Version 0.8 +------------ + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.8.0 + whatsnew/v0.8.1 + +Version 0.7 +----------- + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.7.0 + whatsnew/v0.7.1 + whatsnew/v0.7.2 + whatsnew/v0.7.3 + +Version 0.6 +----------- + +.. toctree:: + :maxdepth: 2 + + + whatsnew/v0.6.0 + whatsnew/v0.6.1 + +Version 0.5 +----------- + +.. toctree:: + :maxdepth: 2 + + + whatsnew/v0.5.0 + +Version 0.4 +----------- + +.. toctree:: + :maxdepth: 2 + + whatsnew/v0.4.x diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 7d9925d800441..19857db1743e8 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -5,9 +5,10 @@ :suppress: import numpy as np - np.random.seed(123456) import pandas as pd - pd.options.display.max_rows=15 + + np.random.seed(123456) + pd.options.display.max_rows = 15 np.set_printoptions(precision=4, suppress=True) ************************** @@ -17,24 +18,25 @@ Reshaping and Pivot Tables Reshaping by pivoting DataFrame objects --------------------------------------- -.. ipython:: +.. image:: _static/reshaping_pivot.png + +.. ipython:: python :suppress: - In [1]: import pandas.util.testing as tm; tm.N = 3 + import pandas.util.testing as tm + tm.N = 3 - In [2]: def unpivot(frame): - ...: N, K = frame.shape - ...: data = {'value' : frame.values.ravel('F'), - ...: 'variable' : np.asarray(frame.columns).repeat(N), - ...: 'date' : np.tile(np.asarray(frame.index), K)} - ...: columns = ['date', 'variable', 'value'] - ...: return pd.DataFrame(data, columns=columns) - ...: + def unpivot(frame): + N, K = frame.shape + data = {'value': frame.to_numpy().ravel('F'), + 'variable': np.asarray(frame.columns).repeat(N), + 'date': np.tile(np.asarray(frame.index), K)} + columns = ['date', 'variable', 'value'] + return pd.DataFrame(data, columns=columns) - In [3]: df = unpivot(tm.makeTimeDataFrame()) + df = unpivot(tm.makeTimeDataFrame()) -Data is often stored in CSV files or databases in so-called "stacked" or -"record" format: +Data is often stored in so-called "stacked" or "record" format: .. ipython:: python @@ -45,13 +47,19 @@ For the curious here is how the above ``DataFrame`` was created: .. 
code-block:: python - import pandas.util.testing as tm; tm.N = 3 + import pandas.util.testing as tm + + tm.N = 3 + + def unpivot(frame): N, K = frame.shape - data = {'value' : frame.values.ravel('F'), - 'variable' : np.asarray(frame.columns).repeat(N), - 'date' : np.tile(np.asarray(frame.index), K)} + data = {'value': frame.to_numpy().ravel('F'), + 'variable': np.asarray(frame.columns).repeat(N), + 'date': np.tile(np.asarray(frame.index), K)} return pd.DataFrame(data, columns=['date', 'variable', 'value']) + + df = unpivot(tm.makeTimeDataFrame()) To select out everything for variable ``A`` we could do: @@ -60,8 +68,6 @@ To select out everything for variable ``A`` we could do: df[df['variable'] == 'A'] -.. image:: _static/reshaping_pivot.png - But suppose we wish to do time series operations with the variables. A better representation would be where the ``columns`` are the unique variables and an ``index`` of dates identifies individual observations. To reshape the data into @@ -81,7 +87,7 @@ column: .. ipython:: python df['value2'] = df['value'] * 2 - pivoted = df.pivot('date', 'variable') + pivoted = df.pivot(index='date', columns='variable') pivoted You can then select subsets from the pivoted ``DataFrame``: @@ -93,6 +99,12 @@ You can then select subsets from the pivoted ``DataFrame``: Note that this returns a view on the underlying data in the case where the data are homogeneously-typed. +.. note:: + :func:`~pandas.pivot` will error with a ``ValueError: Index contains duplicate + entries, cannot reshape`` if the index/column pair is not unique. In this + case, consider using :func:`~pandas.pivot_table` which is a generalization + of pivot that can handle duplicate values for one index/column pair. + .. _reshaping.stacking: Reshaping by stacking and unstacking @@ -173,7 +185,7 @@ will result in a **sorted** copy of the original ``DataFrame`` or ``Series``: .. ipython:: python - index = pd.MultiIndex.from_product([[2,1], ['a', 'b']]) + index = pd.MultiIndex.from_product([[2, 1], ['a', 'b']]) df = pd.DataFrame(np.random.randn(4), index=index, columns=['A']) df all(df.unstack().stack() == df.sort_index()) @@ -193,9 +205,8 @@ processed individually. .. ipython:: python columns = pd.MultiIndex.from_tuples([ - ('A', 'cat', 'long'), ('B', 'cat', 'long'), - ('A', 'dog', 'short'), ('B', 'dog', 'short') - ], + ('A', 'cat', 'long'), ('B', 'cat', 'long'), + ('A', 'dog', 'short'), ('B', 'dog', 'short')], names=['exp', 'animal', 'hair_length'] ) df = pd.DataFrame(np.random.randn(4, 4), columns=columns) @@ -290,10 +301,10 @@ For instance, .. ipython:: python - cheese = pd.DataFrame({'first' : ['John', 'Mary'], - 'last' : ['Doe', 'Bo'], - 'height' : [5.5, 6.0], - 'weight' : [130, 150]}) + cheese = pd.DataFrame({'first': ['John', 'Mary'], + 'last': ['Doe', 'Bo'], + 'height': [5.5, 6.0], + 'weight': [130, 150]}) cheese cheese.melt(id_vars=['first', 'last']) cheese.melt(id_vars=['first', 'last'], var_name='quantity') @@ -304,11 +315,11 @@ user-friendly. .. 
ipython:: python - dft = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, - "A1980" : {0 : "d", 1 : "e", 2 : "f"}, - "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, - "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, - "X" : dict(zip(range(3), np.random.randn(3))) + dft = pd.DataFrame({"A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: .7}, + "B1980": {0: 3.2, 1: 1.3, 2: .1}, + "X": dict(zip(range(3), np.random.randn(3))) }) dft["id"] = dft.index dft @@ -379,7 +390,8 @@ We can produce pivot tables from this data very easily: pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) pd.pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum) - pd.pivot_table(df, values=['D','E'], index=['B'], columns=['A', 'C'], aggfunc=np.sum) + pd.pivot_table(df, values=['D', 'E'], index=['B'], columns=['A', 'C'], + aggfunc=np.sum) The result object is a ``DataFrame`` having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table @@ -647,7 +659,7 @@ When a column contains only one level, it will be omitted in the result. .. ipython:: python - df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')}) + df = pd.DataFrame({'A': list('aaaaa'), 'B': list('ababc')}) pd.get_dummies(df) @@ -698,10 +710,103 @@ handling of NaN: In [3]: np.unique(x, return_inverse=True)[::-1] Out[3]: (array([3, 3, 0, 4, 1, 2]), array([nan, 3.14, inf, 'A', 'B'], dtype=object)) - .. note:: If you just want to handle one column as a categorical variable (like R's factor), you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or ``df["cat_col"] = df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`, see the :ref:`Categorical introduction ` and the :ref:`API documentation `. + +Examples +-------- + +In this section, we will review frequently asked questions and examples. The +column names and relevant column values are named to correspond with how this +DataFrame will be pivoted in the answers below. + +.. ipython:: python + + np.random.seed([3, 1415]) + n = 20 + + cols = np.array(['key', 'row', 'item', 'col']) + df = cols + pd.DataFrame((np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str)) + df.columns = cols + df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val')) + + df + +Pivoting with Single Aggregations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose we wanted to pivot ``df`` such that the ``col`` values are columns, +``row`` values are the index, and the mean of ``val0`` are the values? In +particular, the resulting DataFrame should look like: + +.. code-block:: ipython + + col col0 col1 col2 col3 col4 + row + row0 0.77 0.605 NaN 0.860 0.65 + row2 0.13 NaN 0.395 0.500 0.25 + row3 NaN 0.310 NaN 0.545 NaN + row4 NaN 0.100 0.395 0.760 0.24 + +This solution uses :func:`~pandas.pivot_table`. Also note that +``aggfunc='mean'`` is the default. It is included here to be explicit. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='mean') + +Note that we can also replace the missing values by using the ``fill_value`` +parameter. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='mean', fill_value=0) + +Also note that we can pass in other aggregation functions as well. For example, +we can also pass in ``sum``. + +.. 
ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='sum', fill_value=0) + +Another aggregation we can do is calculate the frequency in which the columns +and rows occur together a.k.a. "cross tabulation". To do this, we can pass +``size`` to the ``aggfunc`` parameter. + +.. ipython:: python + + df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size') + +Pivoting with Multiple Aggregations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We can also perform multiple aggregations. For example, to perform both a +``sum`` and ``mean``, we can pass in a list to the ``aggfunc`` argument. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc=['mean', 'sum']) + +Note to aggregate over multiple value columns, we can pass in a list to the +``values`` parameter. + +.. ipython:: python + + df.pivot_table( + values=['val0', 'val1'], index='row', columns='col', aggfunc=['mean']) + +Note to subdivide over multiple columns we can pass in a list to the +``columns`` parameter. + +.. ipython:: python + + df.pivot_table( + values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 2bb99dd1822b6..5a4a211a5e6b4 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -5,9 +5,9 @@ :suppress: import numpy as np - np.random.seed(123456) import pandas as pd - import pandas.util.testing as tm + + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15 @@ -26,7 +26,7 @@ data structures have a ``to_sparse`` method: .. ipython:: python - ts = pd.Series(randn(10)) + ts = pd.Series(np.random.randn(10)) ts[2:-2] = np.nan sts = ts.to_sparse() sts @@ -44,7 +44,7 @@ large, mostly NA ``DataFrame``: .. ipython:: python - df = pd.DataFrame(randn(10000, 4)) + df = pd.DataFrame(np.random.randn(10000, 4)) df.iloc[:9998] = np.nan sdf = df.to_sparse() sdf @@ -62,6 +62,26 @@ Any sparse object can be converted back to the standard dense form by calling sts.to_dense() +.. _sparse.accessor: + +Sparse Accessor +--------------- + +.. versionadded:: 0.24.0 + +Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` +for categorical data, and ``.dt`` for datetime-like data. This namespace provides +attributes and methods that are specific to sparse data. + +.. ipython:: python + + s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]") + s.sparse.density + s.sparse.fill_value + +This accessor is available only on data with ``SparseDtype``, and on the :class:`Series` +class itself for creating a Series with sparse data from a scipy COO matrix with. + .. _sparse.array: SparseArray @@ -74,7 +94,8 @@ distinct from the ``fill_value``: .. ipython:: python arr = np.random.randn(10) - arr[2:5] = np.nan; arr[7:8] = np.nan + arr[2:5] = np.nan + arr[7:8] = np.nan sparr = pd.SparseArray(arr) sparr @@ -224,7 +245,7 @@ The method requires a ``MultiIndex`` with two or more levels. 
(1, 1, 'b', 1), (2, 1, 'b', 0), (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) + names=['A', 'B', 'C', 'D']) s # SparseSeries diff --git a/doc/source/style.ipynb b/doc/source/style.ipynb index 6f66c1a9bf7f9..792fe5120f6e8 100644 --- a/doc/source/style.ipynb +++ b/doc/source/style.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Styling\n", "\n", @@ -51,7 +49,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "nbsphinx": "hidden" }, "outputs": [], @@ -64,9 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -132,9 +127,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def color_negative_red(val):\n", @@ -188,9 +181,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def highlight_max(s):\n", @@ -253,9 +244,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def highlight_max(data, color='yellow'):\n", @@ -908,9 +897,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from IPython.html import widgets\n", @@ -925,9 +912,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def magnify():\n", @@ -946,9 +931,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(25)\n", @@ -985,18 +968,16 @@ "- `vertical-align`\n", "- `white-space: nowrap`\n", "\n", - "Only CSS2 named colors and hex colors of the form `#rgb` or `#rrggbb` are currently supported.\n", "\n", - "The following pseudo CSS properties are also available to set excel specific style properties:\n", - "- `number-format`\n" + "- Only CSS2 named colors and hex colors of the form `#rgb` or `#rrggbb` are currently supported.\n", + "- The following pseudo CSS properties are also available to set excel specific style properties:\n", + " - `number-format`\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "df.style.\\\n", @@ -1037,9 +1018,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from jinja2 import Environment, ChoiceLoader, FileSystemLoader\n", @@ -1047,39 +1026,21 @@ "from pandas.io.formats.style import Styler" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%mkdir templates" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This next cell writes the custom template.\n", - "We extend the template `html.tpl`, which comes with pandas." + "We'll use the following template:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "%%file templates/myhtml.tpl\n", - "{% extends \"html.tpl\" %}\n", - "{% block table %}\n", - "
<h1>{{ table_title|default(\"My Table\") }}</h1>
\n", - "{{ super() }}\n", - "{% endblock table %}" + "with open(\"templates/myhtml.tpl\") as f:\n", + " print(f.read())" ] }, { @@ -1093,9 +1054,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "class MyStyler(Styler):\n", @@ -1122,9 +1081,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "MyStyler(df)" @@ -1140,9 +1097,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "HTML(MyStyler(df).render(table_title=\"Extending Example\"))" @@ -1158,9 +1113,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "EasyStyler = Styler.from_custom_template(\"templates\", \"myhtml.tpl\")\n", @@ -1177,9 +1130,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with open(\"template_structure.html\") as f:\n", @@ -1199,7 +1150,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "nbsphinx": "hidden" }, "outputs": [], @@ -1216,7 +1166,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [default]", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1230,14 +1180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 1, - "version_minor": 0 - } + "version": "3.7.0" } }, "nbformat": 4, diff --git a/doc/source/templates/myhtml.tpl b/doc/source/templates/myhtml.tpl new file mode 100644 index 0000000000000..1170fd3def653 --- /dev/null +++ b/doc/source/templates/myhtml.tpl @@ -0,0 +1,5 @@ +{% extends "html.tpl" %} +{% block table %} +
<h1>{{ table_title|default("My Table") }}</h1>
+{{ super() }} +{% endblock table %} diff --git a/doc/source/text.rst b/doc/source/text.rst index d01c48695d0d6..d677cc38c9888 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -6,10 +6,9 @@ import numpy as np import pandas as pd - randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) - from pandas.compat import lrange - pd.options.display.max_rows=15 + pd.options.display.max_rows = 15 ====================== Working with Text Data @@ -43,8 +42,8 @@ leading or trailing whitespace: .. ipython:: python - df = pd.DataFrame(randn(3, 2), columns=[' Column A ', ' Column B '], - index=range(3)) + df = pd.DataFrame(np.random.randn(3, 2), + columns=[' Column A ', ' Column B '], index=range(3)) df Since ``df.columns`` is an Index object, we can use the ``.str`` accessor @@ -169,12 +168,18 @@ positional argument (a regex object) and return a string. # Reverse every lowercase alphabetic word pat = r'[a-z]+' - repl = lambda m: m.group(0)[::-1] + + def repl(m): + return m.group(0)[::-1] + pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(pat, repl) # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" - repl = lambda m: m.group('two').swapcase() + + def repl(m): + return m.group('two').swapcase() + pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) .. versionadded:: 0.20.0 @@ -216,7 +221,7 @@ The content of a ``Series`` (or ``Index``) can be concatenated: s = pd.Series(['a', 'b', 'c', 'd']) s.str.cat(sep=',') - + If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``: .. ipython:: python @@ -239,7 +244,7 @@ The first argument to :meth:`~Series.str.cat` can be a list-like object, provide .. ipython:: python s.str.cat(['A', 'B', 'C', 'D']) - + Missing values on either side will result in missing values in the result as well, *unless* ``na_rep`` is specified: .. ipython:: python @@ -260,7 +265,7 @@ The parameter ``others`` can also be two-dimensional. In this case, the number o s d s.str.cat(d, na_rep='-') - + Concatenating a Series and an indexed object into a Series, with alignment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -312,8 +317,8 @@ All one-dimensional list-likes can be combined in a list-like container (includi s u - s.str.cat([u.values, - u.index.astype(str).values], na_rep='-') + s.str.cat([u.array, + u.index.astype(str).array], na_rep='-') All elements must match in length to the calling ``Series`` (or ``Index``), except those having an index if ``join`` is not None: @@ -375,7 +380,7 @@ DataFrame with one column per group. .. ipython:: python - pd.Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)', expand=False) + pd.Series(['a1', 'b2', 'c3']).str.extract(r'([ab])(\d)', expand=False) Elements that do not match return a row filled with ``NaN``. Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -388,13 +393,14 @@ Named groups like .. ipython:: python - pd.Series(['a1', 'b2', 'c3']).str.extract('(?P[ab])(?P\d)', expand=False) + pd.Series(['a1', 'b2', 'c3']).str.extract(r'(?P[ab])(?P\d)', + expand=False) and optional groups like .. ipython:: python - pd.Series(['a1', 'b2', '3']).str.extract('([ab])?(\d)', expand=False) + pd.Series(['a1', 'b2', '3']).str.extract(r'([ab])?(\d)', expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -405,13 +411,13 @@ with one column if ``expand=True``. .. 
ipython:: python - pd.Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)', expand=True) + pd.Series(['a1', 'b2', 'c3']).str.extract(r'[ab](\d)', expand=True) It returns a Series if ``expand=False``. .. ipython:: python - pd.Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)', expand=False) + pd.Series(['a1', 'b2', 'c3']).str.extract(r'[ab](\d)', expand=False) Calling on an ``Index`` with a regex with exactly one capture group returns a ``DataFrame`` with one column if ``expand=True``. diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index e602e45784f4a..8dab39aafbf67 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -4,18 +4,12 @@ .. ipython:: python :suppress: - import datetime import numpy as np import pandas as pd + np.random.seed(123456) - randn = np.random.randn - randint = np.random.randint np.set_printoptions(precision=4, suppress=True) - pd.options.display.max_rows=15 - import dateutil - import pytz - from dateutil.relativedelta import relativedelta - from pandas.tseries.offsets import * + pd.options.display.max_rows = 15 .. _timedeltas.timedeltas: @@ -37,6 +31,8 @@ You can construct a ``Timedelta`` scalar through various arguments: .. ipython:: python + import datetime + # strings pd.Timedelta('1 days') pd.Timedelta('1 days 00:00:00') @@ -74,13 +70,14 @@ You can construct a ``Timedelta`` scalar through various arguments: .. ipython:: python - pd.Timedelta(Second(2)) + pd.Timedelta(pd.offsets.Second(2)) Further, operations among the scalars yield another scalar ``Timedelta``. .. ipython:: python - pd.Timedelta(Day(2)) + pd.Timedelta(Second(2)) + pd.Timedelta('00:00:00.000123') + pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) +\ + pd.Timedelta('00:00:00.000123') to_timedelta ~~~~~~~~~~~~ @@ -135,8 +132,8 @@ subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. .. ipython:: python s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) - td = pd.Series([ pd.Timedelta(days=i) for i in range(3) ]) - df = pd.DataFrame(dict(A = s, B = td)) + td = pd.Series([pd.Timedelta(days=i) for i in range(3)]) + df = pd.DataFrame({'A': s, 'B': td}) df df['C'] = df['A'] + df['B'] df @@ -145,8 +142,8 @@ subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. s - s.max() s - datetime.datetime(2011, 1, 1, 3, 5) s + datetime.timedelta(minutes=5) - s + Minute(5) - s + Minute(5) + Milli(5) + s + pd.offsets.Minute(5) + s + pd.offsets.Minute(5) + pd.offsets.Milli(5) Operations with scalars from a ``timedelta64[ns]`` series: @@ -184,7 +181,7 @@ Operands can also appear in a reversed order (a singular object operated with a A = s - pd.Timestamp('20120101') - pd.Timedelta('00:05:05') B = s - pd.Series(pd.date_range('2012-1-2', periods=3, freq='D')) - df = pd.DataFrame(dict(A=A, B=B)) + df = pd.DataFrame({'A': A, 'B': B}) df df.min() @@ -232,7 +229,8 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob .. ipython:: python - y2 = pd.Series(pd.to_timedelta(['-1 days +00:00:05', 'nat', '-1 days +00:00:05', '1 days'])) + y2 = pd.Series(pd.to_timedelta(['-1 days +00:00:05', 'nat', + '-1 days +00:00:05', '1 days'])) y2 y2.mean() y2.median() @@ -250,8 +248,10 @@ Note that division by the NumPy scalar is true division, while astyping is equiv .. 
ipython:: python - td = pd.Series(pd.date_range('20130101', periods=4)) - \ - pd.Series(pd.date_range('20121201', periods=4)) + december = pd.Series(pd.date_range('20121201', periods=4)) + january = pd.Series(pd.date_range('20130101', periods=4)) + td = january - december + td[2] += datetime.timedelta(minutes=5, seconds=3) td[3] = np.nan td @@ -360,8 +360,8 @@ or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent miss .. ipython:: python - pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', - np.timedelta64(2,'D'), datetime.timedelta(days=2,seconds=2)]) + pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(2, 'D'), + datetime.timedelta(days=2, seconds=2)]) The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation: @@ -458,7 +458,7 @@ Similarly to frequency conversion on a ``Series`` above, you can convert these i .. ipython:: python - tdi / np.timedelta64(1,'s') + tdi / np.timedelta64(1, 's') tdi.astype('timedelta64[s]') Scalars type ops work as well. These can potentially return a *different* type of index. diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 85b0abe421eb2..bca7b6a601dd2 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -4,18 +4,12 @@ .. ipython:: python :suppress: - from datetime import datetime, timedelta, time import numpy as np import pandas as pd - from pandas import offsets + np.random.seed(123456) - randn = np.random.randn - randint = np.random.randint np.set_printoptions(precision=4, suppress=True) - pd.options.display.max_rows=15 - import dateutil - import pytz - from dateutil.relativedelta import relativedelta + pd.options.display.max_rows = 15 ******************************** Time Series / Date functionality @@ -32,7 +26,10 @@ Parsing time series information from various sources and formats .. ipython:: python - dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime(2018, 1, 1)]) + import datetime + + dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), + datetime.datetime(2018, 1, 1)]) dti Generate sequences of fixed-frequency dates and time spans @@ -69,7 +66,7 @@ Performing date and time arithmetic with absolute or relative time increments saturday = friday + pd.Timedelta('1 day') saturday.day_name() # Add 1 business day (Friday --> Monday) - monday = friday + pd.tseries.offsets.BDay() + monday = friday + pd.offsets.BDay() monday.day_name() pandas provides a relatively compact and self-contained set of tools for @@ -110,12 +107,14 @@ However, :class:`Series` and :class:`DataFrame` can directly also support the ti pd.Series(pd.date_range('2000', freq='D', periods=3)) -:class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime`` and ``timedelta`` -data when the time data is used as data itself. The ``Period`` and ``DateOffset`` data will be stored as ``object`` data. +:class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime``, ``timedelta`` +and ``Period`` data when passed into those constructors. ``DateOffset`` +data however will be stored as ``object`` data. .. ipython:: python pd.Series(pd.period_range('1/1/2011', freq='M', periods=3)) + pd.Series([pd.DateOffset(1), pd.DateOffset(2)]) pd.Series(pd.date_range('1/1/2011', freq='M', periods=3)) Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which @@ -141,7 +140,7 @@ time. .. 
ipython:: python - pd.Timestamp(datetime(2012, 5, 1)) + pd.Timestamp(datetime.datetime(2012, 5, 1)) pd.Timestamp('2012-05-01') pd.Timestamp(2012, 5, 1) @@ -163,7 +162,9 @@ and :class:`PeriodIndex` respectively. .. ipython:: python - dates = [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02'), pd.Timestamp('2012-05-03')] + dates = [pd.Timestamp('2012-05-01'), + pd.Timestamp('2012-05-02'), + pd.Timestamp('2012-05-03')] ts = pd.Series(np.random.randn(3), dates) type(ts.index) @@ -327,7 +328,7 @@ which can be specified. These are computed from the starting point specified by 1349979305, 1350065705], unit='s') pd.to_datetime([1349720105100, 1349720105200, 1349720105300, - 1349720105400, 1349720105500 ], unit='ms') + 1349720105400, 1349720105500], unit='ms') .. note:: @@ -400,7 +401,9 @@ To generate an index with timestamps, you can use either the ``DatetimeIndex`` o .. ipython:: python - dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)] + dates = [datetime.datetime(2012, 5, 1), + datetime.datetime(2012, 5, 2), + datetime.datetime(2012, 5, 3)] # Note the frequency information index = pd.DatetimeIndex(dates) @@ -418,8 +421,8 @@ to create a ``DatetimeIndex``. The default frequency for ``date_range`` is a .. ipython:: python - start = datetime(2011, 1, 1) - end = datetime(2012, 1, 1) + start = datetime.datetime(2011, 1, 1) + end = datetime.datetime(2012, 1, 1) index = pd.date_range(start, end) index @@ -486,7 +489,7 @@ used if a custom frequency string is passed. weekmask = 'Mon Wed Fri' - holidays = [datetime(2011, 1, 5), datetime(2011, 3, 14)] + holidays = [datetime.datetime(2011, 1, 5), datetime.datetime(2011, 3, 14)] pd.bdate_range(start, end, freq='C', weekmask=weekmask, holidays=holidays) @@ -564,7 +567,7 @@ Dates and strings that parse to timestamps can be passed as indexing parameters: ts['1/31/2011'] - ts[datetime(2011, 12, 25):] + ts[datetime.datetime(2011, 12, 25):] ts['10/31/2011':'12/31/2011'] @@ -583,9 +586,8 @@ would include matching times on an included date: .. ipython:: python - dft = pd.DataFrame(randn(100000,1), - columns=['A'], - index=pd.date_range('20130101',periods=100000,freq='T')) + dft = pd.DataFrame(np.random.randn(100000, 1), columns=['A'], + index=pd.date_range('20130101', periods=100000, freq='T')) dft dft['2013'] @@ -622,10 +624,9 @@ We are stopping on the included end-point as it is part of the index: dft2 = pd.DataFrame(np.random.randn(20, 1), columns=['A'], - index=pd.MultiIndex.from_product([pd.date_range('20130101', - periods=10, - freq='12H'), - ['a', 'b']])) + index=pd.MultiIndex.from_product( + [pd.date_range('20130101', periods=10, freq='12H'), + ['a', 'b']])) dft2 dft2.loc['2013-01-05'] idx = pd.IndexSlice @@ -681,7 +682,7 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, - index=series_minute.index) + index=series_minute.index) dft_minute['2011-12-31 23'] @@ -693,18 +694,16 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python - dft_minute.loc['2011-12-31 23:59'] + dft_minute.loc['2011-12-31 23:59'] Note also that ``DatetimeIndex`` resolution cannot be less precise than day. .. 
ipython:: python series_monthly = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12', - '2012-01', - '2012-02'])) + pd.DatetimeIndex(['2011-12', '2012-01', '2012-02'])) series_monthly.index.resolution - series_monthly['2011-12'] # returns Series + series_monthly['2011-12'] # returns Series Exact Indexing @@ -716,13 +715,14 @@ These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and .. ipython:: python - dft[datetime(2013, 1, 1):datetime(2013,2,28)] + dft[datetime.datetime(2013, 1, 1):datetime.datetime(2013, 2, 28)] With no defaults. .. ipython:: python - dft[datetime(2013, 1, 1, 10, 12, 0):datetime(2013, 2, 28, 10, 12, 0)] + dft[datetime.datetime(2013, 1, 1, 10, 12, 0): + datetime.datetime(2013, 2, 28, 10, 12, 0)] Truncating & Fancy Indexing @@ -823,120 +823,119 @@ on :ref:`.dt accessors`. DateOffset Objects ------------------ -In the preceding examples, we created ``DatetimeIndex`` objects at various -frequencies by passing in :ref:`frequency strings ` -like 'M', 'W', and 'BM' to the ``freq`` keyword. Under the hood, these frequency -strings are being translated into an instance of :class:`DateOffset`, -which represents a regular frequency increment. Specific offset logic like -"month", "business day", or "one hour" is represented in its various subclasses. +In the preceding examples, frequency strings (e.g. ``'D'``) were used to specify +a frequency that defined: -.. csv-table:: - :header: "Class name", "Description" - :widths: 15, 65 +* how the date times in :class:`DatetimeIndex` were spaced when using :meth:`date_range` +* the frequency of a :class:`Period` or :class:`PeriodIndex` - DateOffset, "Generic offset class, defaults to 1 calendar day" - BDay, "business day (weekday)" - CDay, "custom business day" - Week, "one week, optionally anchored on a day of the week" - WeekOfMonth, "the x-th day of the y-th week of each month" - LastWeekOfMonth, "the x-th day of the last week of each month" - MonthEnd, "calendar month end" - MonthBegin, "calendar month begin" - BMonthEnd, "business month end" - BMonthBegin, "business month begin" - CBMonthEnd, "custom business month end" - CBMonthBegin, "custom business month begin" - SemiMonthEnd, "15th (or other day_of_month) and calendar month end" - SemiMonthBegin, "15th (or other day_of_month) and calendar month begin" - QuarterEnd, "calendar quarter end" - QuarterBegin, "calendar quarter begin" - BQuarterEnd, "business quarter end" - BQuarterBegin, "business quarter begin" - FY5253Quarter, "retail (aka 52-53 week) quarter" - YearEnd, "calendar year end" - YearBegin, "calendar year begin" - BYearEnd, "business year end" - BYearBegin, "business year begin" - FY5253, "retail (aka 52-53 week) year" - BusinessHour, "business hour" - CustomBusinessHour, "custom business hour" - Hour, "one hour" - Minute, "one minute" - Second, "one second" - Milli, "one millisecond" - Micro, "one microsecond" - Nano, "one nanosecond" +These frequency strings map to a :class:`DateOffset` object and its subclasses. A :class:`DateOffset` +is similar to a :class:`Timedelta` that represents a duration of time but follows specific calendar duration rules. +For example, a :class:`Timedelta` day will always increment ``datetimes`` by 24 hours, while a :class:`DateOffset` day +will increment ``datetimes`` to the same time the next day whether a day represents 23, 24 or 25 hours due to daylight +savings time. 
However, all :class:`DateOffset` subclasses that are an hour or smaller +(``Hour``, ``Minute``, ``Second``, ``Milli``, ``Micro``, ``Nano``) behave like +:class:`Timedelta` and respect absolute time. -The basic ``DateOffset`` takes the same arguments as -``dateutil.relativedelta``, which works as follows: +The basic :class:`DateOffset` acts similar to ``dateutil.relativedelta`` (`relativedelta documentation`_) +that shifts a date time by the corresponding calendar duration specified. The +arithmetic operator (``+``) or the ``apply`` method can be used to perform the shift. .. ipython:: python - d = datetime(2008, 8, 18, 9, 0) - d + relativedelta(months=4, days=5) - -We could have done the same thing with ``DateOffset``: - -.. ipython:: python - - from pandas.tseries.offsets import * - d + DateOffset(months=4, days=5) - -The key features of a ``DateOffset`` object are: - -* It can be added / subtracted to/from a datetime object to obtain a - shifted date. -* It can be multiplied by an integer (positive or negative) so that the - increment will be applied multiple times. -* It has :meth:`~pandas.DateOffset.rollforward` and - :meth:`~pandas.DateOffset.rollback` methods for moving a date forward or - backward to the next or previous "offset date". - -Subclasses of ``DateOffset`` define the ``apply`` function which dictates -custom date increment logic, such as adding business days: - -.. code-block:: python - - class BDay(DateOffset): - """DateOffset increments between business days""" - def apply(self, other): - ... - -.. ipython:: python - - d - 5 * BDay() - d + BMonthEnd() - -The ``rollforward`` and ``rollback`` methods do exactly what you would expect: - -.. ipython:: python - - d - offset = BMonthEnd() - offset.rollforward(d) - offset.rollback(d) - -It's definitely worth exploring the ``pandas.tseries.offsets`` module and the -various docstrings for the classes. + # This particular day contains a day light savings time transition + ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + # Respects absolute time + ts + pd.Timedelta(days=1) + # Respects calendar time + ts + pd.DateOffset(days=1) + friday = pd.Timestamp('2018-01-05') + friday.day_name() + # Add 2 business days (Friday --> Tuesday) + two_business_days = 2 * pd.offsets.BDay() + two_business_days.apply(friday) + friday + two_business_days + (friday + two_business_days).day_name() + +Most ``DateOffsets`` have associated frequencies strings, or offset aliases, that can be passed +into ``freq`` keyword arguments. The available date offsets and associated frequency strings can be found below: -These operations (``apply``, ``rollforward`` and ``rollback``) preserve time -(hour, minute, etc) information by default. To reset time, use ``normalize`` -before or after applying the operation (depending on whether you want the -time information included in the operation. +.. 
csv-table:: + :header: "Date Offset", "Frequency String", "Description" + :widths: 15, 15, 65 + + ``DateOffset``, None, "Generic offset class, defaults to 1 calendar day" + ``BDay`` or ``BusinessDay``, ``'B'``,"business day (weekday)" + ``CDay`` or ``CustomBusinessDay``, ``'C'``, "custom business day" + ``Week``, ``'W'``, "one week, optionally anchored on a day of the week" + ``WeekOfMonth``, ``'WOM'``, "the x-th day of the y-th week of each month" + ``LastWeekOfMonth``, ``'LWOM'``, "the x-th day of the last week of each month" + ``MonthEnd``, ``'M'``, "calendar month end" + ``MonthBegin``, ``'MS'``, "calendar month begin" + ``BMonthEnd`` or ``BusinessMonthEnd``, ``'BM'``, "business month end" + ``BMonthBegin`` or ``BusinessMonthBegin``, ``'BMS'``, "business month begin" + ``CBMonthEnd`` or ``CustomBusinessMonthEnd``, ``'CBM'``, "custom business month end" + ``CBMonthBegin`` or ``CustomBusinessMonthBegin``, ``'CBMS'``, "custom business month begin" + ``SemiMonthEnd``, ``'SM'``, "15th (or other day_of_month) and calendar month end" + ``SemiMonthBegin``, ``'SMS'``, "15th (or other day_of_month) and calendar month begin" + ``QuarterEnd``, ``'Q'``, "calendar quarter end" + ``QuarterBegin``, ``'QS'``, "calendar quarter begin" + ``BQuarterEnd``, ``'BQ``, "business quarter end" + ``BQuarterBegin``, ``'BQS'``, "business quarter begin" + ``FY5253Quarter``, ``'REQ'``, "retail (aka 52-53 week) quarter" + ``YearEnd``, ``'A'``, "calendar year end" + ``YearBegin``, ``'AS'`` or ``'BYS'``,"calendar year begin" + ``BYearEnd``, ``'BA'``, "business year end" + ``BYearBegin``, ``'BAS'``, "business year begin" + ``FY5253``, ``'RE'``, "retail (aka 52-53 week) year" + ``Easter``, None, "Easter holiday" + ``BusinessHour``, ``'BH'``, "business hour" + ``CustomBusinessHour``, ``'CBH'``, "custom business hour" + ``Day``, ``'D'``, "one absolute day" + ``Hour``, ``'H'``, "one hour" + ``Minute``, ``'T'`` or ``'min'``,"one minute" + ``Second``, ``'S'``, "one second" + ``Milli``, ``'L'`` or ``'ms'``, "one millisecond" + ``Micro``, ``'U'`` or ``'us'``, "one microsecond" + ``Nano``, ``'N'``, "one nanosecond" + +``DateOffsets`` additionally have :meth:`rollforward` and :meth:`rollback` +methods for moving a date forward or backward respectively to a valid offset +date relative to the offset. For example, business offsets will roll dates +that land on the weekends (Saturday and Sunday) forward to Monday since +business offsets operate on the weekdays. + +.. ipython:: python + + ts = pd.Timestamp('2018-01-06 00:00:00') + ts.day_name() + # BusinessHour's valid offset dates are Monday through Friday + offset = pd.offsets.BusinessHour(start='09:00') + # Bring the date to the closest offset date (Monday) + offset.rollforward(ts) + # Date is brought to the closest offset date first and then the hour is added + ts + offset + +These operations preserve time (hour, minute, etc) information by default. +To reset time to midnight, use :meth:`normalize` before or after applying +the operation (depending on whether you want the time information included +in the operation). .. ipython:: python ts = pd.Timestamp('2014-01-01 09:00') - day = Day() + day = pd.offsets.Day() day.apply(ts) day.apply(ts).normalize() ts = pd.Timestamp('2014-01-01 22:00') - hour = Hour() + hour = pd.offsets.Hour() hour.apply(ts) hour.apply(ts).normalize() hour.apply(pd.Timestamp("2014-01-01 23:30")).normalize() +.. _relativedelta documentation: https://dateutil.readthedocs.io/en/stable/relativedelta.html + .. _timeseries.dayvscalendarday: Day vs. 
CalendarDay @@ -968,27 +967,28 @@ particular day of the week: .. ipython:: python + d = datetime.datetime(2008, 8, 18, 9, 0) d - d + Week() - d + Week(weekday=4) - (d + Week(weekday=4)).weekday() + d + pd.offsets.Week() + d + pd.offsets.Week(weekday=4) + (d + pd.offsets.Week(weekday=4)).weekday() - d - Week() + d - pd.offsets.Week() The ``normalize`` option will be effective for addition and subtraction. .. ipython:: python - d + Week(normalize=True) - d - Week(normalize=True) + d + pd.offsets.Week(normalize=True) + d - pd.offsets.Week(normalize=True) Another example is parameterizing ``YearEnd`` with the specific ending month: .. ipython:: python - d + YearEnd() - d + YearEnd(month=6) + d + pd.offsets.YearEnd() + d + pd.offsets.YearEnd(month=6) .. _timeseries.offsetseries: @@ -1004,9 +1004,9 @@ apply the offset to each element. rng = pd.date_range('2012-01-01', '2012-01-03') s = pd.Series(rng) rng - rng + DateOffset(months=2) - s + DateOffset(months=2) - s - DateOffset(months=2) + rng + pd.DateOffset(months=2) + s + pd.DateOffset(months=2) + s - pd.DateOffset(months=2) If the offset class maps directly to a ``Timedelta`` (``Day``, ``Hour``, ``Minute``, ``Second``, ``Micro``, ``Milli``, ``Nano``) it can be @@ -1015,10 +1015,10 @@ used exactly like a ``Timedelta`` - see the .. ipython:: python - s - Day(2) + s - pd.offsets.Day(2) td = s - pd.Series(pd.date_range('2011-12-29', '2011-12-31')) td - td + Minute(15) + td + pd.offsets.Minute(15) Note that some offsets (such as ``BQuarterEnd``) do not have a vectorized implementation. They can still be used but may @@ -1027,7 +1027,7 @@ calculate significantly slower and will show a ``PerformanceWarning`` .. ipython:: python :okwarning: - rng + BQuarterEnd() + rng + pd.offsets.BQuarterEnd() .. _timeseries.custombusinessdays: @@ -1043,15 +1043,17 @@ As an interesting example, let's look at Egypt where a Friday-Saturday weekend i .. ipython:: python - from pandas.tseries.offsets import CustomBusinessDay weekmask_egypt = 'Sun Mon Tue Wed Thu' # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', datetime(2013, 5, 1), np.datetime64('2014-05-01')] - bday_egypt = CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) - dt = datetime(2013, 4, 30) + holidays = ['2012-05-01', + datetime.datetime(2013, 5, 1), + np.datetime64('2014-05-01')] + bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, + weekmask=weekmask_egypt) + dt = datetime.datetime(2013, 4, 30) dt + 2 * bday_egypt Let's map to the weekday names: @@ -1060,7 +1062,8 @@ Let's map to the weekday names: dts = pd.date_range(dt, periods=5, freq=bday_egypt) - pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) + pd.Series(dts.weekday, dts).map( + pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) Holiday calendars can be used to provide the list of holidays. See the :ref:`holiday calendar` section for more information. @@ -1069,10 +1072,10 @@ Holiday calendars can be used to provide the list of holidays. See the from pandas.tseries.holiday import USFederalHolidayCalendar - bday_us = CustomBusinessDay(calendar=USFederalHolidayCalendar()) + bday_us = pd.offsets.CustomBusinessDay(calendar=USFederalHolidayCalendar()) # Friday before MLK Day - dt = datetime(2014, 1, 17) + dt = datetime.datetime(2014, 1, 17) # Tuesday after MLK Day (Monday is skipped because it's a holiday) dt + bday_us @@ -1082,15 +1085,15 @@ in the usual way. .. 
ipython:: python - from pandas.tseries.offsets import CustomBusinessMonthBegin - bmth_us = CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar()) + bmth_us = pd.offsets.CustomBusinessMonthBegin( + calendar=USFederalHolidayCalendar()) # Skip new years - dt = datetime(2013, 12, 17) + dt = datetime.datetime(2013, 12, 17) dt + bmth_us # Define date index with custom offset - pd.DatetimeIndex(start='20100101',end='20120101',freq=bmth_us) + pd.DatetimeIndex(start='20100101', end='20120101', freq=bmth_us) .. note:: @@ -1111,13 +1114,13 @@ allowing to use specific start and end times. By default, ``BusinessHour`` uses 9:00 - 17:00 as business hours. Adding ``BusinessHour`` will increment ``Timestamp`` by hourly frequency. -If target ``Timestamp`` is out of business hours, move to the next business hour -then increment it. If the result exceeds the business hours end, the remaining +If target ``Timestamp`` is out of business hours, move to the next business hour +then increment it. If the result exceeds the business hours end, the remaining hours are added to the next business day. .. ipython:: python - bh = BusinessHour() + bh = pd.offsets.BusinessHour() bh # 2014-08-01 is Friday @@ -1134,19 +1137,19 @@ hours are added to the next business day. pd.Timestamp('2014-08-01 16:30') + bh # Adding 2 business hours - pd.Timestamp('2014-08-01 10:00') + BusinessHour(2) + pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(2) # Subtracting 3 business hours - pd.Timestamp('2014-08-01 10:00') + BusinessHour(-3) + pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(-3) -You can also specify ``start`` and ``end`` time by keywords. The argument must -be a ``str`` with an ``hour:minute`` representation or a ``datetime.time`` -instance. Specifying seconds, microseconds and nanoseconds as business hour +You can also specify ``start`` and ``end`` time by keywords. The argument must +be a ``str`` with an ``hour:minute`` representation or a ``datetime.time`` +instance. Specifying seconds, microseconds and nanoseconds as business hour results in ``ValueError``. .. ipython:: python - bh = BusinessHour(start='11:00', end=time(20, 0)) + bh = pd.offsets.BusinessHour(start='11:00', end=datetime.time(20, 0)) bh pd.Timestamp('2014-08-01 13:00') + bh @@ -1159,7 +1162,7 @@ Valid business hours are distinguished by whether it started from valid ``Busine .. ipython:: python - bh = BusinessHour(start='17:00', end='09:00') + bh = pd.offsets.BusinessHour(start='17:00', end='09:00') bh pd.Timestamp('2014-08-01 17:00') + bh @@ -1184,22 +1187,22 @@ under the default business hours (9:00 - 17:00), there is no gap (0 minutes) bet .. ipython:: python # This adjusts a Timestamp to business hour edge - BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00')) - BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00')) # It is the same as BusinessHour().apply(pd.Timestamp('2014-08-01 17:00')). 
# And it is the same as BusinessHour().apply(pd.Timestamp('2014-08-04 09:00')) - BusinessHour().apply(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02 15:00')) # BusinessDay results (for reference) - BusinessHour().rollforward(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02')) # It is the same as BusinessDay().apply(pd.Timestamp('2014-08-01')) # The result is the same as rollworward because BusinessDay never overlap. - BusinessHour().apply(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02')) -``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary -holidays, you can use ``CustomBusinessHour`` offset, as explained in the +``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary +holidays, you can use ``CustomBusinessHour`` offset, as explained in the following subsection. .. _timeseries.custombusinesshour: @@ -1216,9 +1219,9 @@ as ``BusinessHour`` except that it skips specified custom holidays. .. ipython:: python from pandas.tseries.holiday import USFederalHolidayCalendar - bhour_us = CustomBusinessHour(calendar=USFederalHolidayCalendar()) + bhour_us = pd.offsets.CustomBusinessHour(calendar=USFederalHolidayCalendar()) # Friday before MLK Day - dt = datetime(2014, 1, 17, 15) + dt = datetime.datetime(2014, 1, 17, 15) dt + bhour_us @@ -1229,7 +1232,8 @@ You can use keyword arguments supported by either ``BusinessHour`` and ``CustomB .. ipython:: python - bhour_mon = CustomBusinessHour(start='10:00', weekmask='Tue Wed Thu Fri') + bhour_mon = pd.offsets.CustomBusinessHour(start='10:00', + weekmask='Tue Wed Thu Fri') # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 @@ -1285,7 +1289,7 @@ most functions: pd.date_range(start, periods=5, freq='B') - pd.date_range(start, periods=5, freq=BDay()) + pd.date_range(start, periods=5, freq=pd.offsets.BDay()) You can combine together day and intraday offsets: @@ -1352,39 +1356,39 @@ anchor point, and moved ``|n|-1`` additional steps forwards or backwards. .. ipython:: python - pd.Timestamp('2014-01-02') + MonthBegin(n=1) - pd.Timestamp('2014-01-02') + MonthEnd(n=1) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') - MonthBegin(n=1) - pd.Timestamp('2014-01-02') - MonthEnd(n=1) + pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-02') - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') + MonthBegin(n=4) - pd.Timestamp('2014-01-02') - MonthBegin(n=4) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=4) + pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=4) If the given date *is* on an anchor point, it is moved ``|n|`` points forwards or backwards. .. 
ipython:: python - pd.Timestamp('2014-01-01') + MonthBegin(n=1) - pd.Timestamp('2014-01-31') + MonthEnd(n=1) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') - MonthBegin(n=1) - pd.Timestamp('2014-01-31') - MonthEnd(n=1) + pd.Timestamp('2014-01-01') - pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-31') - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') + MonthBegin(n=4) - pd.Timestamp('2014-01-31') - MonthBegin(n=4) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=4) + pd.Timestamp('2014-01-31') - pd.offsets.MonthBegin(n=4) For the case when ``n=0``, the date is not moved if on an anchor point, otherwise it is rolled forward to the next anchor point. .. ipython:: python - pd.Timestamp('2014-01-02') + MonthBegin(n=0) - pd.Timestamp('2014-01-02') + MonthEnd(n=0) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=0) + pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=0) - pd.Timestamp('2014-01-01') + MonthBegin(n=0) - pd.Timestamp('2014-01-31') + MonthEnd(n=0) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=0) + pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=0) .. _timeseries.holiday: @@ -1427,10 +1431,13 @@ An example of how holidays and holiday calendars are defined: USMemorialDay, Holiday('July 4th', month=7, day=4, observance=nearest_workday), Holiday('Columbus Day', month=10, day=1, - offset=DateOffset(weekday=MO(2))), #same as 2*Week(weekday=2) - ] + offset=pd.DateOffset(weekday=MO(2)))] + cal = ExampleCalendar() - cal.holidays(datetime(2012, 1, 1), datetime(2012, 12, 31)) + cal.holidays(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 12, 31)) + +:hint: + **weekday=MO(2)** is same as **2 * Week(weekday=2)** Using this calendar, creating an index or doing offset arithmetic skips weekends and holidays (i.e., Memorial Day/July 4th). For example, the below defines @@ -1440,14 +1447,13 @@ or ``Timestamp`` objects. .. ipython:: python - from pandas.tseries.offsets import CDay pd.DatetimeIndex(start='7/1/2012', end='7/10/2012', - freq=CDay(calendar=cal)).to_pydatetime() - offset = CustomBusinessDay(calendar=cal) - datetime(2012, 5, 25) + offset - datetime(2012, 7, 3) + offset - datetime(2012, 7, 3) + 2 * offset - datetime(2012, 7, 6) + offset + freq=pd.offsets.CDay(calendar=cal)).to_pydatetime() + offset = pd.offsets.CustomBusinessDay(calendar=cal) + datetime.datetime(2012, 5, 25) + offset + datetime.datetime(2012, 7, 3) + offset + datetime.datetime(2012, 7, 3) + 2 * offset + datetime.datetime(2012, 7, 6) + offset Ranges are defined by the ``start_date`` and ``end_date`` class attributes of ``AbstractHolidayCalendar``. The defaults are shown below. @@ -1462,8 +1468,8 @@ datetime/Timestamp/string. .. ipython:: python - AbstractHolidayCalendar.start_date = datetime(2012, 1, 1) - AbstractHolidayCalendar.end_date = datetime(2012, 12, 31) + AbstractHolidayCalendar.start_date = datetime.datetime(2012, 1, 1) + AbstractHolidayCalendar.end_date = datetime.datetime(2012, 12, 31) cal.holidays() Every calendar class is accessible by name using the ``get_calendar`` function @@ -1490,7 +1496,7 @@ Shifting / Lagging ~~~~~~~~~~~~~~~~~~ One may want to *shift* or *lag* the values in a time series back and forward in -time. The method for this is :meth:`~Series.shift`, which is available on all of +time. The method for this is :meth:`~Series.shift`, which is available on all of the pandas objects. .. ipython:: python @@ -1500,16 +1506,16 @@ the pandas objects. 
ts.shift(1) The ``shift`` method accepts an ``freq`` argument which can accept a -``DateOffset`` class or other ``timedelta``-like object or also an +``DateOffset`` class or other ``timedelta``-like object or also an :ref:`offset alias `: .. ipython:: python - ts.shift(5, freq=offsets.BDay()) + ts.shift(5, freq=pd.offsets.BDay()) ts.shift(5, freq='BM') Rather than changing the alignment of the data and the index, ``DataFrame`` and -``Series`` objects also have a :meth:`~Series.tshift` convenience method that +``Series`` objects also have a :meth:`~Series.tshift` convenience method that changes all the dates in the index by a specified number of offsets: .. ipython:: python @@ -1522,35 +1528,35 @@ is not being realigned. Frequency Conversion ~~~~~~~~~~~~~~~~~~~~ -The primary function for changing frequencies is the :meth:`~Series.asfreq` -method. For a ``DatetimeIndex``, this is basically just a thin, but convenient -wrapper around :meth:`~Series.reindex` which generates a ``date_range`` and +The primary function for changing frequencies is the :meth:`~Series.asfreq` +method. For a ``DatetimeIndex``, this is basically just a thin, but convenient +wrapper around :meth:`~Series.reindex` which generates a ``date_range`` and calls ``reindex``. .. ipython:: python - dr = pd.date_range('1/1/2010', periods=3, freq=3 * offsets.BDay()) - ts = pd.Series(randn(3), index=dr) + dr = pd.date_range('1/1/2010', periods=3, freq=3 * pd.offsets.BDay()) + ts = pd.Series(np.random.randn(3), index=dr) ts - ts.asfreq(BDay()) + ts.asfreq(pd.offsets.BDay()) ``asfreq`` provides a further convenience so you can specify an interpolation method for any gaps that may appear after the frequency conversion. .. ipython:: python - ts.asfreq(BDay(), method='pad') + ts.asfreq(pd.offsets.BDay(), method='pad') Filling Forward / Backward ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is +Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is documented in the :ref:`missing data section `. Converting to Python Datetimes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``DatetimeIndex`` can be converted to an array of Python native +``DatetimeIndex`` can be converted to an array of Python native :py:class:`datetime.datetime` objects using the ``to_pydatetime`` method. .. _timeseries.resampling: @@ -1563,13 +1569,13 @@ Resampling The interface to ``.resample`` has changed in 0.18.0 to be more groupby-like and hence more flexible. See the :ref:`whatsnew docs ` for a comparison with prior versions. -Pandas has a simple, powerful, and efficient functionality for performing -resampling operations during frequency conversion (e.g., converting secondly -data into 5-minutely data). This is extremely common in, but not limited to, +Pandas has a simple, powerful, and efficient functionality for performing +resampling operations during frequency conversion (e.g., converting secondly +data into 5-minutely data). This is extremely common in, but not limited to, financial applications. -:meth:`~Series.resample` is a time-based groupby, followed by a reduction method -on each of its groups. See some :ref:`cookbook examples ` for +:meth:`~Series.resample` is a time-based groupby, followed by a reduction method +on each of its groups. See some :ref:`cookbook examples ` for some advanced strategies. Starting in version 0.18.1, the ``resample()`` function can be used directly from @@ -1577,7 +1583,7 @@ Starting in version 0.18.1, the ``resample()`` function can be used directly fro .. 
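Since ``resample()`` is a time-based groupby, the same reduction can also be spelled with ``pd.Grouper``. A small sketch for illustration only (the data mirrors the basics example below):

.. code-block:: python

   import numpy as np
   import pandas as pd

   rng = pd.date_range('1/1/2012', periods=100, freq='S')
   ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

   # These two statements compute the same 5-minutely means.
   ts.resample('5Min').mean()
   ts.groupby(pd.Grouper(freq='5Min')).mean()

..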
note:: - ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with + ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with a time-based offset, see a discussion :ref:`here `. Basics @@ -1624,7 +1630,7 @@ labels. .. ipython:: python - ts.resample('5Min').mean() # by default label='left' + ts.resample('5Min').mean() # by default label='left' ts.resample('5Min', label='left').mean() @@ -1632,8 +1638,8 @@ labels. .. note:: - The default values for ``label`` and ``closed`` is 'left' for all - frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' + The default values for ``label`` and ``closed`` is 'left' for all + frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. .. ipython:: python @@ -1680,9 +1686,9 @@ Sparse Resampling ~~~~~~~~~~~~~~~~~ Sparse timeseries are the ones where you have a lot fewer points relative -to the amount of time you are looking to resample. Naively upsampling a sparse -series can potentially generate lots of intermediate values. When you don't want -to use a method to fill these values, e.g. ``fill_method`` is ``None``, then +to the amount of time you are looking to resample. Naively upsampling a sparse +series can potentially generate lots of intermediate values. When you don't want +to use a method to fill these values, e.g. ``fill_method`` is ``None``, then intermediate values will be filled with ``NaN``. Since ``resample`` is a time-based groupby, the following is a method to efficiently @@ -1737,7 +1743,7 @@ We can select a specific column or columns using standard getitem. r['A'].mean() - r[['A','B']].mean() + r[['A', 'B']].mean() You can pass a list or dict of functions to do aggregation with, outputting a ``DataFrame``: @@ -1758,21 +1764,21 @@ columns of a ``DataFrame``: .. ipython:: python :okexcept: - r.agg({'A' : np.sum, - 'B' : lambda x: np.std(x, ddof=1)}) + r.agg({'A': np.sum, + 'B': lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the resampled object: .. ipython:: python - r.agg({'A' : 'sum', 'B' : 'std'}) + r.agg({'A': 'sum', 'B': 'std'}) Furthermore, you can also specify multiple aggregation functions for each column separately. .. ipython:: python - r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) + r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) If a ``DataFrame`` does not have a datetimelike index, but instead you want @@ -1784,9 +1790,9 @@ to resample based on datetimelike column in the frame, it can passed to the df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), 'a': np.arange(5)}, index=pd.MultiIndex.from_arrays([ - [1,2,3,4,5], - pd.date_range('2015-01-01', freq='W', periods=5)], - names=['v','d'])) + [1, 2, 3, 4, 5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v', 'd'])) df df.resample('M', on='date').sum() @@ -1845,13 +1851,13 @@ If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, .. ipython:: python p = pd.Period('2014-07-01 09:00', freq='H') - p + Hour(2) - p + timedelta(minutes=120) + p + pd.offsets.Hour(2) + p + datetime.timedelta(minutes=120) p + np.timedelta64(7200, 's') .. code-block:: ipython - In [1]: p + Minute(5) + In [1]: p + pd.offsets.Minute(5) Traceback ... ValueError: Input has different freq from Period(freq=H) @@ -1861,11 +1867,11 @@ If ``Period`` has other frequencies, only the same ``offsets`` can be added. Oth .. 
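The examples that follow show which additions raise. If you do need to move a ``Period`` by an offset of a different frequency, one possible workaround (a sketch, not taken from the original text) is to convert the period with ``Period.asfreq`` first and shift at the finer frequency:

.. code-block:: python

   import pandas as pd

   p = pd.Period('2014-07', freq='M')

   # Adding a mismatched offset such as pd.offsets.MonthBegin(3) raises
   # ValueError, but after converting to a daily period a daily offset
   # can be added.
   p.asfreq('D', how='start') + pd.offsets.Day(3)   # Period('2014-07-04', 'D')

..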
ipython:: python p = pd.Period('2014-07', freq='M') - p + MonthEnd(3) + p + pd.offsets.MonthEnd(3) .. code-block:: ipython - In [1]: p + MonthBegin(3) + In [1]: p + pd.offsets.MonthBegin(3) Traceback ... ValueError: Input has different freq from Period(freq=M) @@ -1923,11 +1929,11 @@ objects: idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') idx - idx + Hour(2) + idx + pd.offsets.Hour(2) idx = pd.period_range('2014-07', periods=5, freq='M') idx - idx + MonthEnd(3) + idx + pd.offsets.MonthEnd(3) ``PeriodIndex`` has its own dtype named ``period``, refer to :ref:`Period Dtypes `. @@ -1977,7 +1983,7 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI ps['2011-01'] - ps[datetime(2011, 12, 25):] + ps[datetime.datetime(2011, 12, 25):] ps['10/31/2011':'12/31/2011'] @@ -1987,9 +1993,11 @@ Passing a string representing a lower frequency than ``PeriodIndex`` returns par ps['2011'] - dfp = pd.DataFrame(np.random.randn(600,1), + dfp = pd.DataFrame(np.random.randn(600, 1), columns=['A'], - index=pd.period_range('2013-01-01 9:00', periods=600, freq='T')) + index=pd.period_range('2013-01-01 9:00', + periods=600, + freq='T')) dfp dfp['2013-01-01 10H'] @@ -2133,7 +2141,8 @@ To convert from an ``int64`` based YYYYMMDD representation. s def conv(x): - return pd.Period(year = x // 10000, month = x//100 % 100, day = x%100, freq='D') + return pd.Period(year=x // 10000, month=x // 100 % 100, + day=x % 100, freq='D') s.apply(conv) s.apply(conv)[2] @@ -2177,6 +2186,8 @@ time zones by starting with ``dateutil/``. .. ipython:: python + import dateutil + # pytz rng_pytz = pd.date_range('3/6/2012 00:00', periods=10, freq='D', tz='Europe/London') @@ -2198,6 +2209,8 @@ which gives you more control over which time zone is used: .. ipython:: python + import pytz + # pytz tz_pytz = pytz.timezone('Europe/London') rng_pytz = pd.date_range('3/6/2012 00:00', periods=10, freq='D', @@ -2296,7 +2309,8 @@ To remove timezone from tz-aware ``DatetimeIndex``, use ``tz_localize(None)`` or .. ipython:: python - didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') + didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', + periods=10, tz='US/Eastern') didx didx.tz_localize(None) didx.tz_convert(None) @@ -2349,7 +2363,8 @@ constructor as well as ``tz_localize``. rng_hourly.tz_localize('US/Eastern', ambiguous=rng_hourly_dst).tolist() rng_hourly.tz_localize('US/Eastern', ambiguous='NaT').tolist() - didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') + didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', + periods=10, tz='US/Eastern') didx didx.tz_localize(None) didx.tz_convert(None) @@ -2357,6 +2372,40 @@ constructor as well as ``tz_localize``. # tz_convert(None) is identical with tz_convert('UTC').tz_localize(None) didx.tz_convert('UCT').tz_localize(None) +.. _timeseries.timezone_nonexistent: + +Nonexistent Times when Localizing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A DST transition may also shift the local time ahead by 1 hour creating nonexistent +local times. The behavior of localizing a timeseries with nonexistent times +can be controlled by the ``nonexistent`` argument. The following options are available: + +* ``raise``: Raises a ``pytz.NonExistentTimeError`` (the default behavior) +* ``NaT``: Replaces nonexistent times with ``NaT`` +* ``shift``: Shifts nonexistent times forward to the closest real time + +.. 
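The same keyword is also accepted by ``Timestamp.tz_localize`` and ``Series.dt.tz_localize``. A minimal sketch, mirroring the 2015-03-29 Europe/Warsaw transition used in the ``DatetimeIndex`` example that follows:

.. code-block:: python

   import pandas as pd

   ts = pd.Timestamp('2015-03-29 02:30:00')   # a nonexistent wall time

   ts.tz_localize('Europe/Warsaw', nonexistent='shift')   # moved forward to 03:00
   ts.tz_localize('Europe/Warsaw', nonexistent='NaT')     # returns NaT

..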
ipython:: python + + dti = pd.date_range(start='2015-03-29 02:30:00', periods=3, freq='H') + # 2:30 is a nonexistent time + +Localization of nonexistent times will raise an error by default. + +.. code-block:: ipython + + In [2]: dti.tz_localize('Europe/Warsaw') + NonExistentTimeError: 2015-03-29 02:30:00 + +Transform nonexistent times to ``NaT`` or the closest real time forward in time. + +.. ipython:: python + + dti + dti.tz_localize('Europe/Warsaw', nonexistent='shift') + dti.tz_localize('Europe/Warsaw', nonexistent='NaT') + + .. _timeseries.timezone_series: TZ Aware Dtypes @@ -2366,14 +2415,14 @@ TZ Aware Dtypes .. ipython:: python - s_naive = pd.Series(pd.date_range('20130101',periods=3)) + s_naive = pd.Series(pd.date_range('20130101', periods=3)) s_naive ``Series/DatetimeIndex`` with a timezone **aware** value are represented with a dtype of ``datetime64[ns, tz]``. .. ipython:: python - s_aware = pd.Series(pd.date_range('20130101',periods=3,tz='US/Eastern')) + s_aware = pd.Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) s_aware Both of these ``Series`` can be manipulated via the ``.dt`` accessor, see :ref:`here `. @@ -2401,22 +2450,22 @@ a convert on an aware stamp. .. note:: - Using the ``.values`` accessor on a ``Series``, returns an NumPy array of the data. + Using :meth:`Series.to_numpy` on a ``Series``, returns a NumPy array of the data. These values are converted to UTC, as NumPy does not currently support timezones (even though it is *printing* in the local timezone!). .. ipython:: python - s_naive.values - s_aware.values + s_naive.to_numpy() + s_aware.to_numpy() Further note that once converted to a NumPy array these would lose the tz tenor. .. ipython:: python - pd.Series(s_aware.values) + pd.Series(s_aware.to_numpy()) However, these can be easily converted: .. ipython:: python - pd.Series(s_aware.values).dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + pd.Series(s_aware.to_numpy()).dt.tz_localize('UTC').dt.tz_convert('US/Eastern') diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 381031fa128e6..83c891c0c0e40 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -7,7 +7,7 @@ Tutorials This is a guide to many pandas tutorials, geared mainly for new users. Internal Guides ---------------- +=============== pandas' own :ref:`10 Minutes to pandas<10min>`. @@ -15,6 +15,9 @@ More complex recipes are in the :ref:`Cookbook`. A handy pandas `cheat sheet `_. +Community Guides +================ + pandas Cookbook --------------- @@ -200,6 +203,5 @@ Various Tutorials * `Financial analysis in Python, by Thomas Wiecki `_ * `Intro to pandas data structures, by Greg Reda `_ * `Pandas and Python: Top 10, by Manish Amde `_ -* `Pandas Tutorial, by Mikhail Semeniuk `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ * `A concise tutorial with real life examples `_ diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 569a6fb7b7a0d..050d754d0ac8b 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -6,13 +6,11 @@ import numpy as np import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15 - import matplotlib - # matplotlib.style.use('default') - import matplotlib.pyplot as plt - plt.close('all') + ************* Visualization @@ -50,7 +48,8 @@ The ``plot`` method on Series and DataFrame is just a simple wrapper around .. 
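The plotting examples in this section assume that NumPy, pandas and matplotlib have been imported. If you are running them outside the documentation build, a minimal setup sketch is:

.. code-block:: python

   import numpy as np
   import pandas as pd
   import matplotlib.pyplot as plt

   ts = pd.Series(np.random.randn(1000),
                  index=pd.date_range('1/1/2000', periods=1000))
   ts.cumsum().plot()
   plt.show()    # render the figure when not working in IPython/Jupyter

..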
ipython:: python - ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), + index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -69,11 +68,13 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), + index=ts.index, columns=list('ABCD')) df = df.cumsum() + plt.figure(); @savefig frame_plot_basic.png - plt.figure(); df.plot(); + df.plot(); You can plot one column versus another using the `x` and `y` keywords in :meth:`~DataFrame.plot`: @@ -355,8 +356,8 @@ more complicated colorization, you can get each drawn artists by passing .. ipython:: python - color = dict(boxes='DarkGreen', whiskers='DarkOrange', - medians='DarkBlue', caps='Gray') + color = {'boxes': 'DarkGreen', 'whiskers': 'DarkOrange', + 'medians': 'DarkBlue', 'caps': 'Gray'} @savefig box_new_colorize.png df.plot.box(color=color, sym='r+') @@ -391,7 +392,7 @@ The existing interface ``DataFrame.boxplot`` to plot boxplot still can be used. .. ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10,5)) + df = pd.DataFrame(np.random.rand(10, 5)) plt.figure(); @savefig box_plot_ex.png @@ -409,8 +410,8 @@ groupings. For instance, .. ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) - df['X'] = pd.Series(['A','A','A','A','A','B','B','B','B','B']) + df = pd.DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) + df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) plt.figure(); @@ -429,14 +430,14 @@ columns: .. ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10,3), columns=['Col1', 'Col2', 'Col3']) - df['X'] = pd.Series(['A','A','A','A','A','B','B','B','B','B']) - df['Y'] = pd.Series(['A','B','A','B','A','B','A','B','A','B']) + df = pd.DataFrame(np.random.rand(10, 3), columns=['Col1', 'Col2', 'Col3']) + df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) + df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B']) plt.figure(); @savefig box_plot_ex3.png - bp = df.boxplot(column=['Col1','Col2'], by=['X','Y']) + bp = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) .. ipython:: python :suppress: @@ -594,7 +595,7 @@ bubble chart using a column of the ``DataFrame`` as the bubble size. .. ipython:: python @savefig scatter_plot_bubble.png - df.plot.scatter(x='a', y='b', s=df['c']*200); + df.plot.scatter(x='a', y='b', s=df['c'] * 200); .. ipython:: python :suppress: @@ -654,8 +655,7 @@ given by column ``z``. The bins are aggregated with NumPy's ``max`` function. df['z'] = np.random.uniform(0, 3, 1000) @savefig hexbin_plot_agg.png - df.plot.hexbin(x='a', y='b', C='z', reduce_C_function=np.max, - gridsize=25) + df.plot.hexbin(x='a', y='b', C='z', reduce_C_function=np.max, gridsize=25) .. ipython:: python :suppress: @@ -682,7 +682,8 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. ipython:: python - series = pd.Series(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], name='series') + series = pd.Series(3 * np.random.rand(4), + index=['a', 'b', 'c', 'd'], name='series') @savefig series_pie_plot.png series.plot.pie(figsize=(6, 6)) @@ -711,7 +712,8 @@ drawn in each pie plots by default; specify ``legend=False`` to hide it. .. 
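Wedge labels, colors and percentage formatting can be passed through to matplotlib as well. A short sketch (the keyword values are purely illustrative):

.. code-block:: python

   import numpy as np
   import pandas as pd

   series = pd.Series(3 * np.random.rand(4),
                      index=['a', 'b', 'c', 'd'], name='series')
   series.plot.pie(labels=['AA', 'BB', 'CC', 'DD'],
                   colors=['r', 'g', 'b', 'c'],
                   autopct='%.2f', fontsize=20, figsize=(6, 6))

..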
ipython:: python - df = pd.DataFrame(3 * np.random.rand(4, 2), index=['a', 'b', 'c', 'd'], columns=['x', 'y']) + df = pd.DataFrame(3 * np.random.rand(4, 2), + index=['a', 'b', 'c', 'd'], columns=['x', 'y']) @savefig df_pie_plot.png df.plot.pie(subplots=True, figsize=(8, 4)) @@ -939,8 +941,8 @@ be passed, and when ``lag=1`` the plot is essentially ``data[:-1]`` vs. plt.figure() - data = pd.Series(0.1 * np.random.rand(1000) + - 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000))) + spacing = np.linspace(-99 * np.pi, 99 * np.pi, num=1000) + data = pd.Series(0.1 * np.random.rand(1000) + 0.9 * np.sin(spacing)) @savefig lag_plot.png lag_plot(data) @@ -976,8 +978,8 @@ autocorrelation plots. plt.figure() - data = pd.Series(0.7 * np.random.rand(1000) + - 0.3 * np.sin(np.linspace(-9 * np.pi, 9 * np.pi, num=1000))) + spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000) + data = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing)) @savefig autocorrelation_plot.png autocorrelation_plot(data) @@ -1078,8 +1080,9 @@ layout and formatting of the returned plot: .. ipython:: python + plt.figure(); @savefig series_plot_basic2.png - plt.figure(); ts.plot(style='k--', label='Series'); + ts.plot(style='k--', label='Series'); .. ipython:: python :suppress: @@ -1106,7 +1109,8 @@ shown by default. .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), + index=ts.index, columns=list('ABCD')) df = df.cumsum() @savefig frame_plot_basic_noleg.png @@ -1130,7 +1134,8 @@ You may pass ``logy`` to get a log-scale Y axis. .. ipython:: python - ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), + index=pd.date_range('1/1/2000', periods=1000)) ts = np.exp(ts.cumsum()) @savefig series_plot_logy.png @@ -1326,14 +1331,15 @@ otherwise you will see a warning. .. ipython:: python - fig, axes = plt.subplots(4, 4, figsize=(6, 6)); - plt.subplots_adjust(wspace=0.5, hspace=0.5); + fig, axes = plt.subplots(4, 4, figsize=(6, 6)) + plt.subplots_adjust(wspace=0.5, hspace=0.5) target1 = [axes[0][0], axes[1][1], axes[2][2], axes[3][3]] target2 = [axes[3][0], axes[2][1], axes[1][2], axes[0][3]] df.plot(subplots=True, ax=target1, legend=False, sharex=False, sharey=False); @savefig frame_plot_subplots_multi_ax.png - (-df).plot(subplots=True, ax=target2, legend=False, sharex=False, sharey=False); + (-df).plot(subplots=True, ax=target2, legend=False, + sharex=False, sharey=False); .. ipython:: python :suppress: @@ -1346,10 +1352,12 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a :suppress: np.random.seed(123456) - ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), + index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() - df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, + columns=list('ABCD')) df = df.cumsum() .. ipython:: python @@ -1360,12 +1368,15 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a .. 
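The ``@savefig`` markers in these blocks belong to the documentation build; in your own scripts, keep a plot drawn on an explicit axis by saving the figure through matplotlib. A minimal sketch (the file name is illustrative):

.. code-block:: python

   import numpy as np
   import pandas as pd
   import matplotlib.pyplot as plt

   ts = pd.Series(np.random.randn(1000),
                  index=pd.date_range('1/1/2000', periods=1000)).cumsum()

   fig, ax = plt.subplots()
   ts.plot(ax=ax, style='k--', label='Series')
   fig.savefig('series_plot.png')

..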
ipython:: python fig, axes = plt.subplots(nrows=2, ncols=2) - df['A'].plot(ax=axes[0,0]); axes[0,0].set_title('A'); - df['B'].plot(ax=axes[0,1]); axes[0,1].set_title('B'); - df['C'].plot(ax=axes[1,0]); axes[1,0].set_title('C'); - + df['A'].plot(ax=axes[0, 0]); + axes[0, 0].set_title('A'); + df['B'].plot(ax=axes[0, 1]); + axes[0, 1].set_title('B'); + df['C'].plot(ax=axes[1, 0]); + axes[1, 0].set_title('C'); + df['D'].plot(ax=axes[1, 1]); @savefig series_plot_multi.png - df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D'); + axes[1, 1].set_title('D'); .. ipython:: python :suppress: @@ -1392,10 +1403,16 @@ Here is an example of one way to easily plot group means with standard deviation .. ipython:: python # Generate the data - ix3 = pd.MultiIndex.from_arrays([['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'], ['foo', 'foo', 'bar', 'bar', 'foo', 'foo', 'bar', 'bar']], names=['letter', 'word']) - df3 = pd.DataFrame({'data1': [3, 2, 4, 3, 2, 4, 3, 2], 'data2': [6, 5, 7, 5, 4, 5, 6, 5]}, index=ix3) + ix3 = pd.MultiIndex.from_arrays([ + ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'], + ['foo', 'foo', 'bar', 'bar', 'foo', 'foo', 'bar', 'bar']], + names=['letter', 'word']) + + df3 = pd.DataFrame({'data1': [3, 2, 4, 3, 2, 4, 3, 2], + 'data2': [6, 5, 7, 5, 4, 5, 6, 5]}, index=ix3) - # Group by index labels and take the means and standard deviations for each group + # Group by index labels and take the means and standard deviations + # for each group gp3 = df3.groupby(level=('letter', 'word')) means = gp3.mean() errors = gp3.std() @@ -1405,7 +1422,7 @@ Here is an example of one way to easily plot group means with standard deviation # Plot fig, ax = plt.subplots() @savefig errorbar_example.png - means.plot.bar(yerr=errors, ax=ax) + means.plot.bar(yerr=errors, ax=ax, capsize=4) .. ipython:: python :suppress: @@ -1616,7 +1633,8 @@ when plotting a large number of points. plt.plot(price.index, price, 'k') plt.plot(ma.index, ma, 'b') @savefig bollinger.png - plt.fill_between(mstd.index, ma-2*mstd, ma+2*mstd, color='b', alpha=0.2) + plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd, + color='b', alpha=0.2) .. ipython:: python :suppress: diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst deleted file mode 100644 index 8672685b3ebb4..0000000000000 --- a/doc/source/whatsnew.rst +++ /dev/null @@ -1,109 +0,0 @@ -.. _whatsnew: - -.. currentmodule:: pandas - -.. ipython:: python - :suppress: - - import numpy as np - from pandas import * - import pandas as pd - randn = np.random.randn - np.set_printoptions(precision=4, suppress=True) - options.display.max_rows = 15 - -********** -What's New -********** - -These are new features and improvements of note in each release. - -.. include:: whatsnew/v0.24.0.txt - -.. include:: whatsnew/v0.23.4.txt - -.. include:: whatsnew/v0.23.3.txt - -.. include:: whatsnew/v0.23.2.txt - -.. include:: whatsnew/v0.23.1.txt - -.. include:: whatsnew/v0.23.0.txt - -.. include:: whatsnew/v0.22.0.txt - -.. include:: whatsnew/v0.21.1.txt - -.. include:: whatsnew/v0.21.0.txt - -.. include:: whatsnew/v0.20.3.txt - -.. include:: whatsnew/v0.20.2.txt - -.. include:: whatsnew/v0.20.0.txt - -.. include:: whatsnew/v0.19.2.txt - -.. include:: whatsnew/v0.19.1.txt - -.. include:: whatsnew/v0.19.0.txt - -.. include:: whatsnew/v0.18.1.txt - -.. include:: whatsnew/v0.18.0.txt - -.. include:: whatsnew/v0.17.1.txt - -.. include:: whatsnew/v0.17.0.txt - -.. include:: whatsnew/v0.16.2.txt - -.. include:: whatsnew/v0.16.1.txt - -.. include:: whatsnew/v0.16.0.txt - -.. include:: whatsnew/v0.15.2.txt - -.. 
include:: whatsnew/v0.15.1.txt - -.. include:: whatsnew/v0.15.0.txt - -.. include:: whatsnew/v0.14.1.txt - -.. include:: whatsnew/v0.14.0.txt - -.. include:: whatsnew/v0.13.1.txt - -.. include:: whatsnew/v0.13.0.txt - -.. include:: whatsnew/v0.12.0.txt - -.. include:: whatsnew/v0.11.0.txt - -.. include:: whatsnew/v0.10.1.txt - -.. include:: whatsnew/v0.10.0.txt - -.. include:: whatsnew/v0.9.1.txt - -.. include:: whatsnew/v0.9.0.txt - -.. include:: whatsnew/v0.8.1.txt - -.. include:: whatsnew/v0.8.0.txt - -.. include:: whatsnew/v0.7.3.txt - -.. include:: whatsnew/v0.7.2.txt - -.. include:: whatsnew/v0.7.1.txt - -.. include:: whatsnew/v0.7.0.txt - -.. include:: whatsnew/v0.6.1.txt - -.. include:: whatsnew/v0.6.0.txt - -.. include:: whatsnew/v0.5.0.txt - -.. include:: whatsnew/v0.4.x.txt diff --git a/doc/source/whatsnew/v0.10.0.txt b/doc/source/whatsnew/v0.10.0.rst similarity index 99% rename from doc/source/whatsnew/v0.10.0.txt rename to doc/source/whatsnew/v0.10.0.rst index 298088a4f96b3..27f20111dbf96 100644 --- a/doc/source/whatsnew/v0.10.0.txt +++ b/doc/source/whatsnew/v0.10.0.rst @@ -1,13 +1,10 @@ .. _whatsnew_0100: -.. ipython:: python - :suppress: - - from pandas.compat import StringIO - v0.10.0 (December 17, 2012) --------------------------- +{{ common_imports }} + This is a major release from 0.9.1 and includes many new features and enhancements along with a large number of bug fixes. There are also a number of important API changes that long-time pandas users should pay close attention @@ -431,3 +428,11 @@ Here is a taste of what to expect. See the :ref:`full release notes ` or issue tracker on GitHub for a complete list. + + +.. _whatsnew_0.10.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.9.0..v0.10.0 diff --git a/doc/source/whatsnew/v0.10.1.txt b/doc/source/whatsnew/v0.10.1.rst similarity index 98% rename from doc/source/whatsnew/v0.10.1.txt rename to doc/source/whatsnew/v0.10.1.rst index f1a32440c6950..5679babf07b73 100644 --- a/doc/source/whatsnew/v0.10.1.txt +++ b/doc/source/whatsnew/v0.10.1.rst @@ -3,6 +3,8 @@ v0.10.1 (January 22, 2013) --------------------------- +{{ common_imports }} + This is a minor release from 0.10.0 and includes new features, enhancements, and bug fixes. In particular, there is substantial new HDFStore functionality contributed by Jeff Reback. @@ -208,3 +210,11 @@ combined result, by using ``where`` on a selector table. See the :ref:`full release notes ` or issue tracker on GitHub for a complete list. + + +.. _whatsnew_0.10.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.10.0..v0.10.1 diff --git a/doc/source/whatsnew/v0.11.0.txt b/doc/source/whatsnew/v0.11.0.rst similarity index 98% rename from doc/source/whatsnew/v0.11.0.txt rename to doc/source/whatsnew/v0.11.0.rst index f39e6c9ff459b..051d735e539aa 100644 --- a/doc/source/whatsnew/v0.11.0.txt +++ b/doc/source/whatsnew/v0.11.0.rst @@ -3,6 +3,8 @@ v0.11.0 (April 22, 2013) ------------------------ +{{ common_imports }} + This is a major release from 0.10.1 and includes many new features and enhancements along with a large number of bug fixes. The methods of Selecting Data have had quite a number of additions, and Dtype support is now full-fledged. @@ -330,3 +332,11 @@ Enhancements See the :ref:`full release notes ` or issue tracker on GitHub for a complete list. + + +.. _whatsnew_0.11.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v0.10.1..v0.11.0 diff --git a/doc/source/whatsnew/v0.12.0.txt b/doc/source/whatsnew/v0.12.0.rst similarity index 99% rename from doc/source/whatsnew/v0.12.0.txt rename to doc/source/whatsnew/v0.12.0.rst index f66f6c0f72d5d..a462359b6e3c0 100644 --- a/doc/source/whatsnew/v0.12.0.txt +++ b/doc/source/whatsnew/v0.12.0.rst @@ -3,6 +3,8 @@ v0.12.0 (July 24, 2013) ------------------------ +{{ common_imports }} + This is a major release from 0.11.0 and includes several new features and enhancements along with a large number of bug fixes. @@ -504,3 +506,11 @@ Bug Fixes See the :ref:`full release notes ` or issue tracker on GitHub for a complete list. + + +.. _whatsnew_0.12.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.11.0..v0.12.0 diff --git a/doc/source/whatsnew/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.rst similarity index 66% rename from doc/source/whatsnew/v0.13.0.txt rename to doc/source/whatsnew/v0.13.0.rst index 94cd451196ead..037347afb1d59 100644 --- a/doc/source/whatsnew/v0.13.0.txt +++ b/doc/source/whatsnew/v0.13.0.rst @@ -3,6 +3,8 @@ v0.13.0 (January 3, 2014) --------------------------- +{{ common_imports }} + This is a major release from 0.12.0 and includes a number of API changes, several new features and enhancements along with a large number of bug fixes. @@ -425,7 +427,7 @@ than switching to the short info view (:issue:`4886`, :issue:`5550`). This makes the representation more consistent as small DataFrames get larger. -.. image:: _static/df_repr_truncated.png +.. image:: ../_static/df_repr_truncated.png :alt: Truncated HTML representation of a DataFrame To get the info view, call :meth:`DataFrame.info`. If you prefer the @@ -976,11 +978,308 @@ to unify methods and behaviors. Series formerly subclassed directly from s.a = 5 s +.. _release.bug_fixes-0.13.0: + Bug Fixes ~~~~~~~~~ -See :ref:`V0.13.0 Bug Fixes` for an extensive list of bugs that have been fixed in 0.13.0. +- ``HDFStore`` + + - raising an invalid ``TypeError`` rather than ``ValueError`` when + appending with a different block ordering (:issue:`4096`) + - ``read_hdf`` was not respecting as passed ``mode`` (:issue:`4504`) + - appending a 0-len table will work correctly (:issue:`4273`) + - ``to_hdf`` was raising when passing both arguments ``append`` and + ``table`` (:issue:`4584`) + - reading from a store with duplicate columns across dtypes would raise + (:issue:`4767`) + - Fixed a bug where ``ValueError`` wasn't correctly raised when column + names weren't strings (:issue:`4956`) + - A zero length series written in Fixed format not deserializing properly. + (:issue:`4708`) + - Fixed decoding perf issue on pyt3 (:issue:`5441`) + - Validate levels in a MultiIndex before storing (:issue:`5527`) + - Correctly handle ``data_columns`` with a Panel (:issue:`5717`) +- Fixed bug in tslib.tz_convert(vals, tz1, tz2): it could raise IndexError + exception while trying to access trans[pos + 1] (:issue:`4496`) +- The ``by`` argument now works correctly with the ``layout`` argument + (:issue:`4102`, :issue:`4014`) in ``*.hist`` plotting methods +- Fixed bug in ``PeriodIndex.map`` where using ``str`` would return the str + representation of the index (:issue:`4136`) +- Fixed test failure ``test_time_series_plot_color_with_empty_kwargs`` when + using custom matplotlib default colors (:issue:`4345`) +- Fix running of stata IO tests. 
Now uses temporary files to write + (:issue:`4353`) +- Fixed an issue where ``DataFrame.sum`` was slower than ``DataFrame.mean`` + for integer valued frames (:issue:`4365`) +- ``read_html`` tests now work with Python 2.6 (:issue:`4351`) +- Fixed bug where ``network`` testing was throwing ``NameError`` because a + local variable was undefined (:issue:`4381`) +- In ``to_json``, raise if a passed ``orient`` would cause loss of data + because of a duplicate index (:issue:`4359`) +- In ``to_json``, fix date handling so milliseconds are the default timestamp + as the docstring says (:issue:`4362`). +- ``as_index`` is no longer ignored when doing groupby apply (:issue:`4648`, + :issue:`3417`) +- JSON NaT handling fixed, NaTs are now serialized to `null` (:issue:`4498`) +- Fixed JSON handling of escapable characters in JSON object keys + (:issue:`4593`) +- Fixed passing ``keep_default_na=False`` when ``na_values=None`` + (:issue:`4318`) +- Fixed bug with ``values`` raising an error on a DataFrame with duplicate + columns and mixed dtypes, surfaced in (:issue:`4377`) +- Fixed bug with duplicate columns and type conversion in ``read_json`` when + ``orient='split'`` (:issue:`4377`) +- Fixed JSON bug where locales with decimal separators other than '.' threw + exceptions when encoding / decoding certain values. (:issue:`4918`) +- Fix ``.iat`` indexing with a ``PeriodIndex`` (:issue:`4390`) +- Fixed an issue where ``PeriodIndex`` joining with self was returning a new + instance rather than the same instance (:issue:`4379`); also adds a test + for this for the other index types +- Fixed a bug with all the dtypes being converted to object when using the + CSV cparser with the usecols parameter (:issue:`3192`) +- Fix an issue in merging blocks where the resulting DataFrame had partially + set _ref_locs (:issue:`4403`) +- Fixed an issue where hist subplots were being overwritten when they were + called using the top level matplotlib API (:issue:`4408`) +- Fixed a bug where calling ``Series.astype(str)`` would truncate the string + (:issue:`4405`, :issue:`4437`) +- Fixed a py3 compat issue where bytes were being repr'd as tuples + (:issue:`4455`) +- Fixed Panel attribute naming conflict if item is named 'a' + (:issue:`3440`) +- Fixed an issue where duplicate indexes were raising when plotting + (:issue:`4486`) +- Fixed an issue where cumsum and cumprod didn't work with bool dtypes + (:issue:`4170`, :issue:`4440`) +- Fixed Panel slicing issued in ``xs`` that was returning an incorrect dimmed + object (:issue:`4016`) +- Fix resampling bug where custom reduce function not used if only one group + (:issue:`3849`, :issue:`4494`) +- Fixed Panel assignment with a transposed frame (:issue:`3830`) +- Raise on set indexing with a Panel and a Panel as a value which needs + alignment (:issue:`3777`) +- frozenset objects now raise in the ``Series`` constructor (:issue:`4482`, + :issue:`4480`) +- Fixed issue with sorting a duplicate MultiIndex that has multiple dtypes + (:issue:`4516`) +- Fixed bug in ``DataFrame.set_values`` which was causing name attributes to + be lost when expanding the index. (:issue:`3742`, :issue:`4039`) +- Fixed issue where individual ``names``, ``levels`` and ``labels`` could be + set on ``MultiIndex`` without validation (:issue:`3714`, :issue:`4039`) +- Fixed (:issue:`3334`) in pivot_table. Margins did not compute if values is + the index. 
+- Fix bug in having a rhs of ``np.timedelta64`` or ``np.offsets.DateOffset`` + when operating with datetimes (:issue:`4532`) +- Fix arithmetic with series/datetimeindex and ``np.timedelta64`` not working + the same (:issue:`4134`) and buggy timedelta in NumPy 1.6 (:issue:`4135`) +- Fix bug in ``pd.read_clipboard`` on windows with PY3 (:issue:`4561`); not + decoding properly +- ``tslib.get_period_field()`` and ``tslib.get_period_field_arr()`` now raise + if code argument out of range (:issue:`4519`, :issue:`4520`) +- Fix boolean indexing on an empty series loses index names (:issue:`4235`), + infer_dtype works with empty arrays. +- Fix reindexing with multiple axes; if an axes match was not replacing the + current axes, leading to a possible lazy frequency inference issue + (:issue:`3317`) +- Fixed issue where ``DataFrame.apply`` was reraising exceptions incorrectly + (causing the original stack trace to be truncated). +- Fix selection with ``ix/loc`` and non_unique selectors (:issue:`4619`) +- Fix assignment with iloc/loc involving a dtype change in an existing column + (:issue:`4312`, :issue:`5702`) have internal setitem_with_indexer in core/indexing + to use Block.setitem +- Fixed bug where thousands operator was not handled correctly for floating + point numbers in csv_import (:issue:`4322`) +- Fix an issue with CacheableOffset not properly being used by many + DateOffset; this prevented the DateOffset from being cached (:issue:`4609`) +- Fix boolean comparison with a DataFrame on the lhs, and a list/tuple on the + rhs (:issue:`4576`) +- Fix error/dtype conversion with setitem of ``None`` on ``Series/DataFrame`` + (:issue:`4667`) +- Fix decoding based on a passed in non-default encoding in ``pd.read_stata`` + (:issue:`4626`) +- Fix ``DataFrame.from_records`` with a plain-vanilla ``ndarray``. + (:issue:`4727`) +- Fix some inconsistencies with ``Index.rename`` and ``MultiIndex.rename``, + etc. (:issue:`4718`, :issue:`4628`) +- Bug in using ``iloc/loc`` with a cross-sectional and duplicate indices + (:issue:`4726`) +- Bug with using ``QUOTE_NONE`` with ``to_csv`` causing ``Exception``. + (:issue:`4328`) +- Bug with Series indexing not raising an error when the right-hand-side has + an incorrect length (:issue:`2702`) +- Bug in MultiIndexing with a partial string selection as one part of a + MultIndex (:issue:`4758`) +- Bug with reindexing on the index with a non-unique index will now raise + ``ValueError`` (:issue:`4746`) +- Bug in setting with ``loc/ix`` a single indexer with a MultiIndex axis and + a NumPy array, related to (:issue:`3777`) +- Bug in concatenation with duplicate columns across dtypes not merging with + axis=0 (:issue:`4771`, :issue:`4975`) +- Bug in ``iloc`` with a slice index failing (:issue:`4771`) +- Incorrect error message with no colspecs or width in ``read_fwf``. + (:issue:`4774`) +- Fix bugs in indexing in a Series with a duplicate index (:issue:`4548`, + :issue:`4550`) +- Fixed bug with reading compressed files with ``read_fwf`` in Python 3. + (:issue:`3963`) +- Fixed an issue with a duplicate index and assignment with a dtype change + (:issue:`4686`) +- Fixed bug with reading compressed files in as ``bytes`` rather than ``str`` + in Python 3. Simplifies bytes-producing file-handling in Python 3 + (:issue:`3963`, :issue:`4785`). 
+- Fixed an issue related to ticklocs/ticklabels with log scale bar plots + across different versions of matplotlib (:issue:`4789`) +- Suppressed DeprecationWarning associated with internal calls issued by + repr() (:issue:`4391`) +- Fixed an issue with a duplicate index and duplicate selector with ``.loc`` + (:issue:`4825`) +- Fixed an issue with ``DataFrame.sort_index`` where, when sorting by a + single column and passing a list for ``ascending``, the argument for + ``ascending`` was being interpreted as ``True`` (:issue:`4839`, + :issue:`4846`) +- Fixed ``Panel.tshift`` not working. Added `freq` support to ``Panel.shift`` + (:issue:`4853`) +- Fix an issue in TextFileReader w/ Python engine (i.e. PythonParser) + with thousands != "," (:issue:`4596`) +- Bug in getitem with a duplicate index when using where (:issue:`4879`) +- Fix Type inference code coerces float column into datetime (:issue:`4601`) +- Fixed ``_ensure_numeric`` does not check for complex numbers + (:issue:`4902`) +- Fixed a bug in ``Series.hist`` where two figures were being created when + the ``by`` argument was passed (:issue:`4112`, :issue:`4113`). +- Fixed a bug in ``convert_objects`` for > 2 ndims (:issue:`4937`) +- Fixed a bug in DataFrame/Panel cache insertion and subsequent indexing + (:issue:`4939`, :issue:`5424`) +- Fixed string methods for ``FrozenNDArray`` and ``FrozenList`` + (:issue:`4929`) +- Fixed a bug with setting invalid or out-of-range values in indexing + enlargement scenarios (:issue:`4940`) +- Tests for fillna on empty Series (:issue:`4346`), thanks @immerrr +- Fixed ``copy()`` to shallow copy axes/indices as well and thereby keep + separate metadata. (:issue:`4202`, :issue:`4830`) +- Fixed skiprows option in Python parser for read_csv (:issue:`4382`) +- Fixed bug preventing ``cut`` from working with ``np.inf`` levels without + explicitly passing labels (:issue:`3415`) +- Fixed wrong check for overlapping in ``DatetimeIndex.union`` + (:issue:`4564`) +- Fixed conflict between thousands separator and date parser in csv_parser + (:issue:`4678`) +- Fix appending when dtypes are not the same (error showing mixing + float/np.datetime64) (:issue:`4993`) +- Fix repr for DateOffset. No longer show duplicate entries in kwds. + Removed unused offset fields. (:issue:`4638`) +- Fixed wrong index name during read_csv if using usecols. Applies to c + parser only. (:issue:`4201`) +- ``Timestamp`` objects can now appear in the left hand side of a comparison + operation with a ``Series`` or ``DataFrame`` object (:issue:`4982`). +- Fix a bug when indexing with ``np.nan`` via ``iloc/loc`` (:issue:`5016`) +- Fixed a bug where low memory c parser could create different types in + different chunks of the same file. Now coerces to numerical type or raises + warning. (:issue:`3866`) +- Fix a bug where reshaping a ``Series`` to its own shape raised + ``TypeError`` (:issue:`4554`) and other reshaping issues. +- Bug in setting with ``ix/loc`` and a mixed int/string index (:issue:`4544`) +- Make sure series-series boolean comparisons are label based (:issue:`4947`) +- Bug in multi-level indexing with a Timestamp partial indexer + (:issue:`4294`) +- Tests/fix for MultiIndex construction of an all-nan frame (:issue:`4078`) +- Fixed a bug where :func:`~pandas.read_html` wasn't correctly inferring + values of tables with commas (:issue:`5029`) +- Fixed a bug where :func:`~pandas.read_html` wasn't providing a stable + ordering of returned tables (:issue:`4770`, :issue:`5029`). 
+- Fixed a bug where :func:`~pandas.read_html` was incorrectly parsing when + passed ``index_col=0`` (:issue:`5066`). +- Fixed a bug where :func:`~pandas.read_html` was incorrectly inferring the + type of headers (:issue:`5048`). +- Fixed a bug where ``DatetimeIndex`` joins with ``PeriodIndex`` caused a + stack overflow (:issue:`3899`). +- Fixed a bug where ``groupby`` objects didn't allow plots (:issue:`5102`). +- Fixed a bug where ``groupby`` objects weren't tab-completing column names + (:issue:`5102`). +- Fixed a bug where ``groupby.plot()`` and friends were duplicating figures + multiple times (:issue:`5102`). +- Provide automatic conversion of ``object`` dtypes on fillna, related + (:issue:`5103`) +- Fixed a bug where default options were being overwritten in the option + parser cleaning (:issue:`5121`). +- Treat a list/ndarray identically for ``iloc`` indexing with list-like + (:issue:`5006`) +- Fix ``MultiIndex.get_level_values()`` with missing values (:issue:`5074`) +- Fix bound checking for Timestamp() with datetime64 input (:issue:`4065`) +- Fix a bug where ``TestReadHtml`` wasn't calling the correct ``read_html()`` + function (:issue:`5150`). +- Fix a bug with ``NDFrame.replace()`` which made replacement appear as + though it was (incorrectly) using regular expressions (:issue:`5143`). +- Fix better error message for to_datetime (:issue:`4928`) +- Made sure different locales are tested on travis-ci (:issue:`4918`). Also + adds a couple of utilities for getting locales and setting locales with a + context manager. +- Fixed segfault on ``isnull(MultiIndex)`` (now raises an error instead) + (:issue:`5123`, :issue:`5125`) +- Allow duplicate indices when performing operations that align + (:issue:`5185`, :issue:`5639`) +- Compound dtypes in a constructor raise ``NotImplementedError`` + (:issue:`5191`) +- Bug in comparing duplicate frames (:issue:`4421`) related +- Bug in describe on duplicate frames +- Bug in ``to_datetime`` with a format and ``coerce=True`` not raising + (:issue:`5195`) +- Bug in ``loc`` setting with multiple indexers and a rhs of a Series that + needs broadcasting (:issue:`5206`) +- Fixed bug where inplace setting of levels or labels on ``MultiIndex`` would + not clear cached ``values`` property and therefore return wrong ``values``. + (:issue:`5215`) +- Fixed bug where filtering a grouped DataFrame or Series did not maintain + the original ordering (:issue:`4621`). +- Fixed ``Period`` with a business date freq to always roll-forward if on a + non-business date. (:issue:`5203`) +- Fixed bug in Excel writers where frames with duplicate column names weren't + written correctly. (:issue:`5235`) +- Fixed issue with ``drop`` and a non-unique index on Series (:issue:`5248`) +- Fixed segfault in C parser caused by passing more names than columns in + the file. 
(:issue:`5156`) +- Fix ``Series.isin`` with date/time-like dtypes (:issue:`5021`) +- C and Python Parser can now handle the more common MultiIndex column + format which doesn't have a row for index names (:issue:`4702`) +- Bug when trying to use an out-of-bounds date as an object dtype + (:issue:`5312`) +- Bug when trying to display an embedded PandasObject (:issue:`5324`) +- Allows operating of Timestamps to return a datetime if the result is out-of-bounds + related (:issue:`5312`) +- Fix return value/type signature of ``initObjToJSON()`` to be compatible + with numpy's ``import_array()`` (:issue:`5334`, :issue:`5326`) +- Bug when renaming then set_index on a DataFrame (:issue:`5344`) +- Test suite no longer leaves around temporary files when testing graphics. (:issue:`5347`) + (thanks for catching this @yarikoptic!) +- Fixed html tests on win32. (:issue:`4580`) +- Make sure that ``head/tail`` are ``iloc`` based, (:issue:`5370`) +- Fixed bug for ``PeriodIndex`` string representation if there are 1 or 2 + elements. (:issue:`5372`) +- The GroupBy methods ``transform`` and ``filter`` can be used on Series + and DataFrames that have repeated (non-unique) indices. (:issue:`4620`) +- Fix empty series not printing name in repr (:issue:`4651`) +- Make tests create temp files in temp directory by default. (:issue:`5419`) +- ``pd.to_timedelta`` of a scalar returns a scalar (:issue:`5410`) +- ``pd.to_timedelta`` accepts ``NaN`` and ``NaT``, returning ``NaT`` instead of raising (:issue:`5437`) +- performance improvements in ``isnull`` on larger size pandas objects +- Fixed various setitem with 1d ndarray that does not have a matching + length to the indexer (:issue:`5508`) +- Bug in getitem with a MultiIndex and ``iloc`` (:issue:`5528`) +- Bug in delitem on a Series (:issue:`5542`) +- Bug fix in apply when using custom function and objects are not mutated (:issue:`5545`) +- Bug in selecting from a non-unique index with ``loc`` (:issue:`5553`) +- Bug in groupby returning non-consistent types when user function returns a ``None``, (:issue:`5592`) +- Work around regression in numpy 1.7.0 which erroneously raises IndexError from ``ndarray.item`` (:issue:`5666`) +- Bug in repeated indexing of object with resultant non-unique index (:issue:`5678`) +- Bug in fillna with Series and a passed series/dict (:issue:`5703`) +- Bug in groupby transform with a datetime-like grouper (:issue:`5712`) +- Bug in MultiIndex selection in PY3 when using certain keys (:issue:`5725`) +- Row-wise concat of differing dtypes failing in certain cases (:issue:`5754`) + +.. _whatsnew_0.13.0.contributors: + +Contributors +~~~~~~~~~~~~ -See the :ref:`full release notes -` or issue tracker -on GitHub for a complete list of all API changes, Enhancements and Bug Fixes. +.. contributors:: v0.12.0..v0.13.0 diff --git a/doc/source/whatsnew/v0.13.1.txt b/doc/source/whatsnew/v0.13.1.rst similarity index 64% rename from doc/source/whatsnew/v0.13.1.txt rename to doc/source/whatsnew/v0.13.1.rst index a4807a6d61b76..6a1b578cc08fb 100644 --- a/doc/source/whatsnew/v0.13.1.txt +++ b/doc/source/whatsnew/v0.13.1.rst @@ -3,6 +3,8 @@ v0.13.1 (February 3, 2014) -------------------------- +{{ common_imports }} + This is a minor release from 0.13.0 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. 
@@ -126,10 +128,6 @@ API changes df.equals(df2) df.equals(df2.sort_index()) - import pandas.core.common as com - com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan])) - np.array_equal(np.array([0, np.nan]), np.array([0, np.nan])) - - ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is empty (:issue:`6007`). @@ -296,11 +294,86 @@ Experimental There are no experimental changes in 0.13.1 +.. _release.bug_fixes-0.13.1: + Bug Fixes ~~~~~~~~~ -See :ref:`V0.13.1 Bug Fixes` for an extensive list of bugs that have been fixed in 0.13.1. +- Bug in ``io.wb.get_countries`` not including all countries (:issue:`6008`) +- Bug in Series replace with timestamp dict (:issue:`5797`) +- read_csv/read_table now respects the `prefix` kwarg (:issue:`5732`). +- Bug in selection with missing values via ``.ix`` from a duplicate indexed DataFrame failing (:issue:`5835`) +- Fix issue of boolean comparison on empty DataFrames (:issue:`5808`) +- Bug in isnull handling ``NaT`` in an object array (:issue:`5443`) +- Bug in ``to_datetime`` when passed a ``np.nan`` or integer datelike and a format string (:issue:`5863`) +- Bug in groupby dtype conversion with datetimelike (:issue:`5869`) +- Regression in handling of empty Series as indexers to Series (:issue:`5877`) +- Bug in internal caching, related to (:issue:`5727`) +- Testing bug in reading JSON/msgpack from a non-filepath on windows under py3 (:issue:`5874`) +- Bug when assigning to .ix[tuple(...)] (:issue:`5896`) +- Bug in fully reindexing a Panel (:issue:`5905`) +- Bug in idxmin/max with object dtypes (:issue:`5914`) +- Bug in ``BusinessDay`` when adding n days to a date not on offset when n>5 and n%5==0 (:issue:`5890`) +- Bug in assigning to chained series with a series via ix (:issue:`5928`) +- Bug in creating an empty DataFrame, copying, then assigning (:issue:`5932`) +- Bug in DataFrame.tail with empty frame (:issue:`5846`) +- Bug in propagating metadata on ``resample`` (:issue:`5862`) +- Fixed string-representation of ``NaT`` to be "NaT" (:issue:`5708`) +- Fixed string-representation for Timestamp to show nanoseconds if present (:issue:`5912`) +- ``pd.match`` not returning passed sentinel +- ``Panel.to_frame()`` no longer fails when ``major_axis`` is a + ``MultiIndex`` (:issue:`5402`). +- Bug in ``pd.read_msgpack`` with inferring a ``DateTimeIndex`` frequency + incorrectly (:issue:`5947`) +- Fixed ``to_datetime`` for array with both Tz-aware datetimes and ``NaT``'s (:issue:`5961`) +- Bug in rolling skew/kurtosis when passed a Series with bad data (:issue:`5749`) +- Bug in scipy ``interpolate`` methods with a datetime index (:issue:`5975`) +- Bug in NaT comparison if a mixed datetime/np.datetime64 with NaT were passed (:issue:`5968`) +- Fixed bug with ``pd.concat`` losing dtype information if all inputs are empty (:issue:`5742`) +- Recent changes in IPython cause warnings to be emitted when using previous versions + of pandas in QTConsole, now fixed. If you're using an older version and + need to suppress the warnings, see (:issue:`5922`). +- Bug in merging ``timedelta`` dtypes (:issue:`5695`) +- Bug in plotting.scatter_matrix function. Wrong alignment among diagonal + and off-diagonal plots, see (:issue:`5497`). 
+- Regression in Series with a MultiIndex via ix (:issue:`6018`) +- Bug in Series.xs with a MultiIndex (:issue:`6018`) +- Bug in Series construction of mixed type with datelike and an integer (which should result in + object type and not automatic conversion) (:issue:`6028`) +- Possible segfault when chained indexing with an object array under NumPy 1.7.1 (:issue:`6026`, :issue:`6056`) +- Bug in setting using fancy indexing a single element with a non-scalar (e.g. a list), + (:issue:`6043`) +- ``to_sql`` did not respect ``if_exists`` (:issue:`4110` :issue:`4304`) +- Regression in ``.get(None)`` indexing from 0.12 (:issue:`5652`) +- Subtle ``iloc`` indexing bug, surfaced in (:issue:`6059`) +- Bug with insert of strings into DatetimeIndex (:issue:`5818`) +- Fixed unicode bug in to_html/HTML repr (:issue:`6098`) +- Fixed missing arg validation in get_options_data (:issue:`6105`) +- Bug in assignment with duplicate columns in a frame where the locations + are a slice (e.g. next to each other) (:issue:`6120`) +- Bug in propagating _ref_locs during construction of a DataFrame with dups + index/columns (:issue:`6121`) +- Bug in ``DataFrame.apply`` when using mixed datelike reductions (:issue:`6125`) +- Bug in ``DataFrame.append`` when appending a row with different columns (:issue:`6129`) +- Bug in DataFrame construction with recarray and non-ns datetime dtype (:issue:`6140`) +- Bug in ``.loc`` setitem indexing with a dataframe on rhs, multiple item setting, and + a datetimelike (:issue:`6152`) +- Fixed a bug in ``query``/``eval`` during lexicographic string comparisons (:issue:`6155`). +- Fixed a bug in ``query`` where the index of a single-element ``Series`` was + being thrown away (:issue:`6148`). +- Bug in ``HDFStore`` on appending a dataframe with MultiIndexed columns to + an existing table (:issue:`6167`) +- Consistency with dtypes in setting an empty DataFrame (:issue:`6171`) +- Bug in selecting on a MultiIndex ``HDFStore`` even in the presence of under + specified column spec (:issue:`6169`) +- Bug in ``nanops.var`` with ``ddof=1`` and 1 elements would sometimes return ``inf`` + rather than ``nan`` on some platforms (:issue:`6136`) +- Bug in Series and DataFrame bar plots ignoring the ``use_index`` keyword (:issue:`6209`) +- Bug in groupby with mixed str/int under python3 fixed; ``argsort`` was failing (:issue:`6212`) + +.. _whatsnew_0.13.1.contributors: + +Contributors +~~~~~~~~~~~~ -See the :ref:`full release notes -` or issue tracker -on GitHub for a complete list of all API changes, Enhancements and Bug Fixes. +.. contributors:: v0.13.0..v0.13.1 diff --git a/doc/source/whatsnew/v0.14.0.txt b/doc/source/whatsnew/v0.14.0.rst similarity index 99% rename from doc/source/whatsnew/v0.14.0.txt rename to doc/source/whatsnew/v0.14.0.rst index d4b7b09c054d6..9606bbac2a1b3 100644 --- a/doc/source/whatsnew/v0.14.0.txt +++ b/doc/source/whatsnew/v0.14.0.rst @@ -3,6 +3,8 @@ v0.14.0 (May 31 , 2014) ----------------------- +{{ common_imports }} + This is a major release from 0.13.1 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -249,13 +251,13 @@ Display Changes constraints were reached and an ellipse (...) signaled that part of the data was cut off. - .. image:: _static/trunc_before.png + .. image:: ../_static/trunc_before.png :alt: The previous look of truncate. 
In the current version, large DataFrames are centrally truncated, showing a preview of head and tail in both dimensions. - .. image:: _static/trunc_after.png + .. image:: ../_static/trunc_after.png :alt: The new look. - allow option ``'truncate'`` for ``display.show_dimensions`` to only show the dimensions if the @@ -1047,3 +1049,11 @@ Bug Fixes - Bug in expressions evaluation with reversed ops, showing in series-dataframe ops (:issue:`7198`, :issue:`7192`) - Bug in multi-axis indexing with > 2 ndim and a MultiIndex (:issue:`7199`) - Fix a bug where invalid eval/query operations would blow the stack (:issue:`5198`) + + +.. _whatsnew_0.14.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.13.1..v0.14.0 diff --git a/doc/source/whatsnew/v0.14.1.txt b/doc/source/whatsnew/v0.14.1.rst similarity index 99% rename from doc/source/whatsnew/v0.14.1.txt rename to doc/source/whatsnew/v0.14.1.rst index d019cf54086c6..3b0ff5650d90d 100644 --- a/doc/source/whatsnew/v0.14.1.txt +++ b/doc/source/whatsnew/v0.14.1.rst @@ -3,6 +3,8 @@ v0.14.1 (July 11, 2014) ----------------------- +{{ common_imports }} + This is a minor release from 0.14.0 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -269,3 +271,11 @@ Bug Fixes - Bug in grouped `hist` doesn't handle `rot` kw and `sharex` kw properly (:issue:`7234`) - Bug in ``.loc`` performing fallback integer indexing with ``object`` dtype indices (:issue:`7496`) - Bug (regression) in ``PeriodIndex`` constructor when passed ``Series`` objects (:issue:`7701`). + + +.. _whatsnew_0.14.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.14.0..v0.14.1 diff --git a/doc/source/whatsnew/v0.15.0.txt b/doc/source/whatsnew/v0.15.0.rst similarity index 99% rename from doc/source/whatsnew/v0.15.0.txt rename to doc/source/whatsnew/v0.15.0.rst index 4be6975958af5..00eda927a9c73 100644 --- a/doc/source/whatsnew/v0.15.0.txt +++ b/doc/source/whatsnew/v0.15.0.rst @@ -3,6 +3,8 @@ v0.15.0 (October 18, 2014) -------------------------- +{{ common_imports }} + This is a major release from 0.14.1 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -1216,3 +1218,11 @@ Bug Fixes - Suppress FutureWarning generated by NumPy when comparing object arrays containing NaN for equality (:issue:`7065`) - Bug in ``DataFrame.eval()`` where the dtype of the ``not`` operator (``~``) was not correctly inferred as ``bool``. + + +.. _whatsnew_0.15.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.14.1..v0.15.0 diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.rst similarity index 98% rename from doc/source/whatsnew/v0.15.1.txt rename to doc/source/whatsnew/v0.15.1.rst index 8cbf239ea20d0..88127d4e1b8d8 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.rst @@ -3,6 +3,8 @@ v0.15.1 (November 9, 2014) -------------------------- +{{ common_imports }} + This is a minor bug-fix release from 0.15.0 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. 
@@ -304,3 +306,11 @@ Bug Fixes - Bug in Setting by indexer to a scalar value with a mixed-dtype `Panel4d` was failing (:issue:`8702`) - Bug where ``DataReader``'s would fail if one of the symbols passed was invalid. Now returns data for valid symbols and np.nan for invalid (:issue:`8494`) - Bug in ``get_quote_yahoo`` that wouldn't allow non-float return values (:issue:`5229`). + + +.. _whatsnew_0.15.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.15.0..v0.15.1 diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.rst similarity index 99% rename from doc/source/whatsnew/v0.15.2.txt rename to doc/source/whatsnew/v0.15.2.rst index ee72fab7d23f2..dd988cde88145 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.rst @@ -3,6 +3,8 @@ v0.15.2 (December 12, 2014) --------------------------- +{{ common_imports }} + This is a minor release from 0.15.1 and includes a large number of bug fixes along with several new features, enhancements, and performance improvements. A small number of API changes were necessary to fix existing bugs. @@ -238,3 +240,11 @@ Bug Fixes - Bug in plotting if sharex was enabled and index was a timeseries, would show labels on multiple axes (:issue:`3964`). - Bug where passing a unit to the TimedeltaIndex constructor applied the to nano-second conversion twice. (:issue:`9011`). - Bug in plotting of a period-like array (:issue:`9012`) + + +.. _whatsnew_0.15.2.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.15.1..v0.15.2 diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.rst similarity index 99% rename from doc/source/whatsnew/v0.16.0.txt rename to doc/source/whatsnew/v0.16.0.rst index ce525bbb4c1d6..d394b43a7ec88 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.rst @@ -3,6 +3,8 @@ v0.16.0 (March 22, 2015) ------------------------ +{{ common_imports }} + This is a major release from 0.15.2 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -74,7 +76,7 @@ calculate the ratio, and plot PetalRatio = lambda x: x.PetalWidth / x.PetalLength) .plot(kind='scatter', x='SepalRatio', y='PetalRatio')) -.. image:: _static/whatsnew_assign.png +.. image:: ../_static/whatsnew_assign.png :scale: 50 % See the :ref:`documentation ` for more. (:issue:`9229`) @@ -675,3 +677,11 @@ Bug Fixes df1 = DataFrame({'x': Series(['a','b','c']), 'y': Series(['d','e','f'])}) df2 = df1[['x']] df2['y'] = ['g', 'h', 'i'] + + +.. _whatsnew_0.16.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.15.2..v0.16.0 diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.rst similarity index 99% rename from doc/source/whatsnew/v0.16.1.txt rename to doc/source/whatsnew/v0.16.1.rst index d3a8064a0e786..aae96a5d63c14 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.rst @@ -3,6 +3,8 @@ v0.16.1 (May 11, 2015) ---------------------- +{{ common_imports }} + This is a minor bug-fix release from 0.16.0 and includes a a large number of bug fixes along several new features, enhancements, and performance improvements. We recommend that all users upgrade to this version. @@ -465,3 +467,11 @@ Bug Fixes - Bug in subclassed ``DataFrame``. It may not return the correct class, when slicing or subsetting it. 
(:issue:`9632`) - Bug in ``.median()`` where non-float null values are not handled correctly (:issue:`10040`) - Bug in Series.fillna() where it raises if a numerically convertible string is given (:issue:`10092`) + + +.. _whatsnew_0.16.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.16.0..v0.16.1 diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.rst similarity index 98% rename from doc/source/whatsnew/v0.16.2.txt rename to doc/source/whatsnew/v0.16.2.rst index 047da4c94093b..acae3a55d5f78 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.rst @@ -3,6 +3,8 @@ v0.16.2 (June 12, 2015) ----------------------- +{{ common_imports }} + This is a minor bug-fix release from 0.16.1 and includes a a large number of bug fixes along some new features (:meth:`~DataFrame.pipe` method), enhancements, and performance improvements. @@ -165,3 +167,11 @@ Bug Fixes - Bug in ``read_hdf`` where open stores could not be used (:issue:`10330`). - Bug in adding empty ``DataFrames``, now results in a ``DataFrame`` that ``.equals`` an empty ``DataFrame`` (:issue:`10181`). - Bug in ``to_hdf`` and ``HDFStore`` which did not check that complib choices were valid (:issue:`4582`, :issue:`8874`). + + +.. _whatsnew_0.16.2.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.16.1..v0.16.2 diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.rst similarity index 99% rename from doc/source/whatsnew/v0.17.0.txt rename to doc/source/whatsnew/v0.17.0.rst index 404f2bf06e861..abde8d953f4df 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.rst @@ -3,6 +3,8 @@ v0.17.0 (October 9, 2015) ------------------------- +{{ common_imports }} + This is a major release from 0.16.2 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -160,7 +162,7 @@ To alleviate this issue, we have added a new, optional plotting interface, which In [14]: df.plot.bar() -.. image:: _static/whatsnew_plot_submethods.png +.. image:: ../_static/whatsnew_plot_submethods.png As a result of this change, these methods are now all discoverable via tab-completion: @@ -313,11 +315,11 @@ has been changed to make this keyword unnecessary - the change is shown below. **Old** -.. image:: _static/old-excel-index.png +.. image:: ../_static/old-excel-index.png **New** -.. image:: _static/new-excel-index.png +.. image:: ../_static/new-excel-index.png .. warning:: @@ -354,14 +356,14 @@ Some East Asian countries use Unicode characters its width is corresponding to 2 df = pd.DataFrame({u'国籍': ['UK', u'日本'], u'名前': ['Alice', u'しのぶ']}) df; -.. image:: _static/option_unicode01.png +.. image:: ../_static/option_unicode01.png .. ipython:: python pd.set_option('display.unicode.east_asian_width', True) df; -.. image:: _static/option_unicode02.png +.. image:: ../_static/option_unicode02.png For further details, see :ref:`here ` @@ -1167,3 +1169,11 @@ Bug Fixes - Bug in ``.groupby`` when number of keys to group by is same as length of index (:issue:`11185`) - Bug in ``convert_objects`` where converted values might not be returned if all null and ``coerce`` (:issue:`9589`) - Bug in ``convert_objects`` where ``copy`` keyword was not respected (:issue:`9589`) + + +.. _whatsnew_0.17.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v0.16.2..v0.17.0 diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.rst similarity index 98% rename from doc/source/whatsnew/v0.17.1.txt rename to doc/source/whatsnew/v0.17.1.rst index 328a8193c8b13..44554a88fba04 100644 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.rst @@ -3,6 +3,8 @@ v0.17.1 (November 21, 2015) --------------------------- +{{ common_imports }} + .. note:: We are proud to announce that *pandas* has become a sponsored project of the (`NumFOCUS organization`_). This will help ensure the success of development of *pandas* as a world-class open-source project. @@ -202,3 +204,11 @@ Bug Fixes - Bug in ``DataFrame.to_sparse()`` loses column names for MultiIndexes (:issue:`11600`) - Bug in ``DataFrame.round()`` with non-unique column index producing a Fatal Python error (:issue:`11611`) - Bug in ``DataFrame.round()`` with ``decimals`` being a non-unique indexed Series producing extra columns (:issue:`11618`) + + +.. _whatsnew_0.17.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.17.0..v0.17.1 diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.rst similarity index 99% rename from doc/source/whatsnew/v0.18.0.txt rename to doc/source/whatsnew/v0.18.0.rst index e38ba54d4b058..5cd4163b1a7a5 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.rst @@ -3,6 +3,8 @@ v0.18.0 (March 13, 2016) ------------------------ +{{ common_imports }} + This is a major release from 0.17.1 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -1290,3 +1292,11 @@ Bug Fixes - Bug when specifying a UTC ``DatetimeIndex`` by setting ``utc=True`` in ``.to_datetime`` (:issue:`11934`) - Bug when increasing the buffer size of CSV reader in ``read_csv`` (:issue:`12494`) - Bug when setting columns of a ``DataFrame`` with duplicate column names (:issue:`12344`) + + +.. _whatsnew_0.18.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.17.1..v0.18.0 diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.rst similarity index 99% rename from doc/source/whatsnew/v0.18.1.txt rename to doc/source/whatsnew/v0.18.1.rst index 34921505a46bf..1dc01d7f1f745 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.rst @@ -3,6 +3,8 @@ v0.18.1 (May 3, 2016) --------------------- +{{ common_imports }} + This is a minor bug-fix release from 0.18.0 and includes a large number of bug fixes along with several new features, enhancements, and performance improvements. We recommend that all users upgrade to this version. @@ -266,7 +268,7 @@ These changes conform sparse handling to return the correct types and work to ma ``SparseArray.take`` now returns a scalar for scalar input, ``SparseArray`` for others. Furthermore, it handles a negative indexer with the same rule as ``Index`` (:issue:`10560`, :issue:`12796`) -.. ipython:: python +.. code-block:: python s = pd.SparseArray([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) s.take(0) @@ -692,3 +694,11 @@ Bug Fixes - Bug in ``pd.to_numeric()`` with ``Index`` returns ``np.ndarray``, rather than ``Index`` (:issue:`12777`) - Bug in ``pd.to_numeric()`` with datetime-like may raise ``TypeError`` (:issue:`12777`) - Bug in ``pd.to_numeric()`` with scalar raises ``ValueError`` (:issue:`12777`) + + +.. 
_whatsnew_0.18.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.18.0..v0.18.1 diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.rst similarity index 99% rename from doc/source/whatsnew/v0.19.0.txt rename to doc/source/whatsnew/v0.19.0.rst index 73fb124afef87..467319a4527d1 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.rst @@ -3,6 +3,8 @@ v0.19.0 (October 2, 2016) ------------------------- +{{ common_imports }} + This is a major release from 0.18.1 and includes number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -1564,3 +1566,11 @@ Bug Fixes - ``PeriodIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) - Bug in ``df.groupby`` where ``.median()`` returns arbitrary values if grouped dataframe contains empty bins (:issue:`13629`) - Bug in ``Index.copy()`` where ``name`` parameter was ignored (:issue:`14302`) + + +.. _whatsnew_0.19.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.18.1..v0.19.0 diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.rst similarity index 97% rename from doc/source/whatsnew/v0.19.1.txt rename to doc/source/whatsnew/v0.19.1.rst index 1c577dddf1cd4..0c909fa4195d7 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.rst @@ -3,6 +3,8 @@ v0.19.1 (November 3, 2016) -------------------------- +{{ common_imports }} + This is a minor bug-fix release from 0.19.0 and includes some small regression fixes, bug fixes and performance improvements. We recommend that all users upgrade to this version. @@ -59,3 +61,11 @@ Bug Fixes - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`) - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) + + +.. _whatsnew_0.19.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.19.0..v0.19.1 diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.rst similarity index 97% rename from doc/source/whatsnew/v0.19.2.txt rename to doc/source/whatsnew/v0.19.2.rst index 171d97b76de75..1cded6d2c94e2 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.rst @@ -3,6 +3,8 @@ v0.19.2 (December 24, 2016) --------------------------- +{{ common_imports }} + This is a minor bug-fix release in the 0.19.x series and includes some small regression fixes, bug fixes and performance improvements. We recommend that all users upgrade to this version. @@ -80,3 +82,11 @@ Bug Fixes - Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) - Bug in ``.plot(kind='kde')`` which did not drop missing values to generate the KDE Plot, instead generating an empty plot. (:issue:`14821`) - Bug in ``unstack()`` if called with a list of column(s) as an argument, regardless of the dtypes of all columns, they get coerced to ``object`` (:issue:`11847`) + + +.. _whatsnew_0.19.2.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v0.19.1..v0.19.2 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.rst similarity index 99% rename from doc/source/whatsnew/v0.20.0.txt rename to doc/source/whatsnew/v0.20.0.rst index 9f5fbdc195f34..8456449ee4419 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.rst @@ -3,6 +3,8 @@ v0.20.1 (May 5, 2017) --------------------- +{{ common_imports }} + This is a major release from 0.19.2 and includes a number of API changes, deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -381,7 +383,7 @@ For example, after running the following, ``styled.xlsx`` renders as below: highlight_max() styled.to_excel('styled.xlsx', engine='openpyxl') -.. image:: _static/style-excel.png +.. image:: ../_static/style-excel.png .. ipython:: python :suppress: @@ -1731,3 +1733,11 @@ Other - Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`) - Bug in interactions with ``Qt`` when a ``QtApplication`` already exists (:issue:`14372`) - Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`) + + +.. _whatsnew_0.20.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.19.2..v0.20.0 diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.rst similarity index 97% rename from doc/source/whatsnew/v0.20.2.txt rename to doc/source/whatsnew/v0.20.2.rst index 3de6fbc8afaf8..784cd09edff30 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.rst @@ -3,6 +3,8 @@ v0.20.2 (June 4, 2017) ---------------------- +{{ common_imports }} + This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes, bug fixes and performance improvements. We recommend that all users upgrade to this version. @@ -125,3 +127,11 @@ Other ^^^^^ - Bug in ``DataFrame.drop()`` with an empty-list with non-unique indices (:issue:`16270`) + + +.. _whatsnew_0.20.2.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.20.0..v0.20.2 diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.rst similarity index 95% rename from doc/source/whatsnew/v0.20.3.txt rename to doc/source/whatsnew/v0.20.3.rst index 582f975f81a7a..47bfcc761b088 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.rst @@ -3,6 +3,8 @@ v0.20.3 (July 7, 2017) ----------------------- +{{ common_imports }} + This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. @@ -58,3 +60,11 @@ Categorical ^^^^^^^^^^^ - Bug in ``DataFrame.sort_values`` not respecting the ``kind`` parameter with categorical data (:issue:`16793`) + + +.. _whatsnew_0.20.3.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v0.20.2..v0.20.3 diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.rst similarity index 99% rename from doc/source/whatsnew/v0.21.0.txt rename to doc/source/whatsnew/v0.21.0.rst index 77ae5b92d0e70..c9a90f3ada7e5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.rst @@ -3,6 +3,8 @@ v0.21.0 (October 27, 2017) -------------------------- +{{ common_imports }} + This is a major release from 0.20.3 and includes a number of API changes, deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -1176,3 +1178,11 @@ Other - Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`) - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) + + +.. _whatsnew_0.21.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.20.3..v0.21.0 diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.rst similarity index 98% rename from doc/source/whatsnew/v0.21.1.txt rename to doc/source/whatsnew/v0.21.1.rst index 49e59c9ddf5a7..bf13d5d67ed63 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.rst @@ -3,6 +3,8 @@ v0.21.1 (December 12, 2017) --------------------------- +{{ common_imports }} + This is a minor bug-fix release in the 0.21.x series and includes some small regression fixes, bug fixes and performance improvements. We recommend that all users upgrade to this version. @@ -169,3 +171,11 @@ String ^^^^^^ - :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) + + +.. _whatsnew_0.21.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.21.0..v0.21.1 diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.rst similarity index 98% rename from doc/source/whatsnew/v0.22.0.txt rename to doc/source/whatsnew/v0.22.0.rst index d165339cb0de9..f05b84a9d8902 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.rst @@ -3,6 +3,8 @@ v0.22.0 (December 29, 2017) --------------------------- +{{ common_imports }} + This is a major release from 0.21.1 and includes a single, API-breaking change. We recommend that all users upgrade to this version after carefully reading the release note (singular!). @@ -241,3 +243,11 @@ With conda, use Note that the inconsistency in the return value for all-*NA* series is still there for pandas 0.20.3 and earlier. Avoiding pandas 0.21 will only help with the empty case. + + +.. _whatsnew_0.22.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.21.1..v0.22.0 diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.rst similarity index 99% rename from doc/source/whatsnew/v0.23.0.txt rename to doc/source/whatsnew/v0.23.0.rst index 473a4bb72e6d9..f84517a3e3b9c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.rst @@ -1,7 +1,9 @@ .. _whatsnew_0230: -v0.23.0 (May 15, 2018) ----------------------- +What's new in 0.23.0 (May 15, 2018) +----------------------------------- + +{{ common_imports }} This is a major release from 0.22.0 and includes a number of API changes, deprecations, new features, enhancements, and performance improvements along @@ -908,7 +910,7 @@ frames would not fit within the terminal width, and pandas would introduce line breaks to display these 20 columns. 
This resulted in an output that was relatively difficult to read: -.. image:: _static/print_df_old.png +.. image:: ../_static/print_df_old.png If Python runs in a terminal, the maximum number of columns is now determined automatically so that the printed data frame fits within the current terminal @@ -918,7 +920,7 @@ well as in many IDEs), this value cannot be inferred automatically and is thus set to `20` as in previous versions. In a terminal, this results in a much nicer output: -.. image:: _static/print_df_new.png +.. image:: ../_static/print_df_new.png Note that if you don't like the new default, you can always set this option yourself. To revert to the old setting, you can run this line: @@ -1412,3 +1414,10 @@ Other - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) - Bug in accessing a :func:`pandas.get_option`, which raised ``KeyError`` rather than ``OptionError`` when looking up a non-existent option key in some cases (:issue:`19789`) - Bug in :func:`testing.assert_series_equal` and :func:`testing.assert_frame_equal` for Series or DataFrames with differing unicode data (:issue:`20503`) + +.. _whatsnew_0.23.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.22.0..v0.23.0 diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.rst similarity index 97% rename from doc/source/whatsnew/v0.23.1.txt rename to doc/source/whatsnew/v0.23.1.rst index 1a514ba627fcb..e8e0060c48337 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.rst @@ -1,7 +1,9 @@ .. _whatsnew_0231: -v0.23.1 (June 12, 2018) ------------------------ +What's New in 0.23.1 (June 12, 2018) +------------------------------------ + +{{ common_imports }} This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. @@ -138,3 +140,10 @@ Bug Fixes - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) - Bug preventing pandas being used on Windows without C++ redistributable installed (:issue:`21106`) + +.. _whatsnew_0.23.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.23.0..v0.23.1 diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.rst similarity index 81% rename from doc/source/whatsnew/v0.23.2.txt rename to doc/source/whatsnew/v0.23.2.rst index 7ec6e2632e717..573a30f17846b 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.rst @@ -1,7 +1,9 @@ .. _whatsnew_0232: -v0.23.2 (July 5, 2018) ----------------------- +What's New in 0.23.2 (July 5, 2018) +----------------------------------- + +{{ common_imports }} This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. 
@@ -101,8 +103,20 @@ Bug Fixes **Timezones** - Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) -- Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) +- Bug in comparing :class:`DataFrame` with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) **Timedelta** - Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`) + +.. _whatsnew_0.23.2.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.23.1..v0.23.2 diff --git a/doc/source/whatsnew/v0.23.3.rst b/doc/source/whatsnew/v0.23.3.rst new file mode 100644 index 0000000000000..29758e54b437b --- /dev/null +++ b/doc/source/whatsnew/v0.23.3.rst @@ -0,0 +1,16 @@ +.. _whatsnew_0233: + +What's New in 0.23.3 (July 7, 2018) +----------------------------------- + +{{ common_imports }} + +This release fixes a build issue with the sdist for Python 3.7 (:issue:`21785`) +There are no other changes. + +.. _whatsnew_0.23.3.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.23.2..v0.23.3 diff --git a/doc/source/whatsnew/v0.23.3.txt b/doc/source/whatsnew/v0.23.3.txt deleted file mode 100644 index b8adce27d2523..0000000000000 --- a/doc/source/whatsnew/v0.23.3.txt +++ /dev/null @@ -1,7 +0,0 @@ -.. _whatsnew_0233: - -v0.23.3 (July 7, 2018) ----------------------- - -This release fixes a build issue with the sdist for Python 3.7 (:issue:`21785`) -There are no other changes. diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.rst similarity index 84% rename from doc/source/whatsnew/v0.23.4.txt rename to doc/source/whatsnew/v0.23.4.rst index 9a3ad3f61ee49..c8f08d0bb7091 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.rst @@ -1,7 +1,9 @@ .. _whatsnew_0234: -v0.23.4 (August 3, 2018) ------------------------- +What's New in 0.23.4 (August 3, 2018) +------------------------------------- + +{{ common_imports }} This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. @@ -35,3 +37,10 @@ Bug Fixes **Missing** - Bug in :func:`Series.clip` and :func:`DataFrame.clip` cannot accept list-like threshold containing ``NaN`` (:issue:`19992`) + +.. _whatsnew_0.23.4.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.23.3..v0.23.4 diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst new file mode 100644 index 0000000000000..4e12b22c8ccac --- /dev/null +++ b/doc/source/whatsnew/v0.24.0.rst @@ -0,0 +1,1600 @@ +.. _whatsnew_0240: + +What's New in 0.24.0 (Month XX, 2018) +------------------------------------- + +.. 
warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +{{ common_imports }} + +These are the changes in pandas 0.24.0. See :ref:`release` for a full changelog +including other versions of pandas. + +.. _whatsnew_0240.enhancements: + +New features +~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) +- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. See :ref:`Splitting an object into groups ` for more information (:issue:`15475`, :issue:`15506`). +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing + the user to override the engine's default behavior to include or omit the + dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) +- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) +- :func:`DataFrame.read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`) + +.. _whatsnew_0240.values_api: + +Accessing the values in a Series or Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:attr:`Series.array` and :attr:`Index.array` have been added for extracting the array backing a +``Series`` or ``Index``. + +.. ipython:: python + + idx = pd.period_range('2000', periods=4) + idx.array + pd.Series(idx).array + +Historically, this would have been done with ``series.values``, but with +``.values`` it was unclear whether the returned value would be the actual array, +some transformation of it, or one of pandas custom arrays (like +``Categorical``). For example, with :class:`PeriodIndex`, ``.values`` generates +a new ndarray of period objects each time. + +.. ipython:: python + + id(idx.values) + id(idx.values) + +If you need an actual NumPy array, use :meth:`Series.to_numpy` or :meth:`Index.to_numpy`. + +.. ipython:: python + + idx.to_numpy() + pd.Series(idx).to_numpy() + +For Series and Indexes backed by normal NumPy arrays, this will be the same thing (and the same +as ``.values``). + +.. ipython:: python + + ser = pd.Series([1, 2, 3]) + ser.array + ser.to_numpy() + +We haven't removed or deprecated :attr:`Series.values` or :attr:`DataFrame.values`, but we +recommend and using ``.array`` or ``.to_numpy()`` instead. + +See :ref:`basics.dtypes` and :ref:`dsintro.attrs` for more. + +.. _whatsnew_0240.enhancements.extension_array_operators: + +``ExtensionArray`` operator support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison +operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: + +1. Define each of the operators on your ``ExtensionArray`` subclass. +2. 
Use an operator implementation from pandas that depends on operators that are already defined + on the underlying elements (scalars) of the ``ExtensionArray``. + +See the :ref:`ExtensionArray Operator Support +` documentation section for details on both +ways of adding operator support. + +.. _whatsnew_0240.enhancements.intna: + +Optional Integer NA Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. +Here is an example of the usage. + +We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value +marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) + +.. ipython:: python + + s = pd.Series([1, 2, np.nan], dtype='Int64') + s + + +Operations on these dtypes will propagate ``NaN`` as other pandas operations. + +.. ipython:: python + + # arithmetic + s + 1 + + # comparison + s == 1 + + # indexing + s.iloc[1:3] + + # operate with other dtypes + s + s.iloc[1:3].astype('Int8') + + # coerce when needed + s + 0.01 + +These dtypes can operate as part of of ``DataFrame``. + +.. ipython:: python + + df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) + df + df.dtypes + + +These dtypes can be merged & reshaped & casted. + +.. ipython:: python + + pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes + df['A'].astype(float) + +Reduction and groupby operations such as 'sum' work. + +.. ipython:: python + + df.sum() + df.groupby('B').A.sum() + +.. warning:: + + The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. + +.. _whatsnew_0240.enhancements.read_html: + +``read_html`` Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. +Now it understands them, treating them as sequences of cells with the same +value. (:issue:`17054`) + +.. ipython:: python + + result = pd.read_html(""" + + + + + + + + + + + +
+      <table>
+        <thead>
+          <tr><th>A</th><th>B</th><th>C</th></tr>
+        </thead>
+        <tbody>
+          <tr><td colspan="2">1</td><td>2</td></tr>
+        </tbody>
+      </table>
""") + +Previous Behavior: + +.. code-block:: ipython + + In [13]: result + Out [13]: + [ A B C + 0 1 2 NaN] + +Current Behavior: + +.. ipython:: python + + result + + +.. _whatsnew_0240.enhancements.interval: + +Storing Interval and Period Data in Series and DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Interval and Period data may now be stored in a ``Series`` or ``DataFrame``, in addition to an +:class:`IntervalIndex` and :class:`PeriodIndex` like previously (:issue:`19453`, :issue:`22862`). + +.. ipython:: python + + ser = pd.Series(pd.interval_range(0, 5)) + ser + ser.dtype + +And for periods: + +.. ipython:: python + + pser = pd.Series(pd.date_range("2000", freq="D", periods=5)) + pser + pser.dtype + +Previously, these would be cast to a NumPy array with object dtype. In general, +this should result in better performance when storing an array of intervals or periods +in a :class:`Series` or column of a :class:`DataFrame`. + +Note that the ``.values`` of a ``Series`` containing one of these types is no longer a NumPy +array, but rather an ``ExtensionArray``: + +.. ipython:: python + + ser.values + pser.values + +This is the same behavior as ``Series.values`` for categorical data. See +:ref:`whatsnew_0240.api_breaking.interval_values` for more. + + +.. _whatsnew_0240.enhancements.styler_pipe: + +New ``Styler.pipe()`` method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The :class:`~pandas.io.formats.style.Styler` class has gained a +:meth:`~pandas.io.formats.style.Styler.pipe` method (:issue:`23229`). This provides a +convenient way to apply users' predefined styling functions, and can help reduce +"boilerplate" when using DataFrame styling functionality repeatedly within a notebook. + +.. ipython:: python + + df = pandas.DataFrame({'N': [1250, 1500, 1750], 'X': [0.25, 0.35, 0.50]}) + + def format_and_align(styler): + return (styler.format({'N': '{:,}', 'X': '{:.1%}'}) + .set_properties(**{'text-align': 'right'})) + + df.style.pipe(format_and_align).set_caption('Summary of results.') + +Similar methods already exist for other classes in pandas, including :meth:`DataFrame.pipe`, +:meth:`Groupby.pipe`, and :meth:`Resampler.pipe`. + + +.. _whatsnew_0240.enhancements.join_with_two_multiindexes: + +Joining with two multi-indexes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`Datafame.merge` and :func:`Dataframe.join` can now be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`6360`) + +See the :ref:`Merge, join, and concatenate +` documentation section. + +.. ipython:: python + + index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) + + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=index_left) + + + index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index_right) + + + left.join(right) + +For earlier versions this can be done using the following. + +.. ipython:: python + + pd.merge(left.reset_index(), right.reset_index(), + on=['key'], how='inner').set_index(['key', 'X', 'Y']) + + +.. 
_whatsnew_0240.enhancements.rename_axis: + +Renaming names in a MultiIndex +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`DataFrame.rename_axis` now supports ``index`` and ``columns`` arguments +and :func:`Series.rename_axis` supports ``index`` argument (:issue:`19978`) + +This change allows a dictionary to be passed so that some of the names +of a ``MultiIndex`` can be changed. + +Example: + +.. ipython:: python + + mi = pd.MultiIndex.from_product([list('AB'), list('CD'), list('EF')], + names=['AB', 'CD', 'EF']) + df = pd.DataFrame([i for i in range(len(mi))], index=mi, columns=['N']) + df + df.rename_axis(index={'CD': 'New'}) + +See the :ref:`advanced docs on renaming` for more details. + + +.. _whatsnew_0240.enhancements.other: + +Other Enhancements +^^^^^^^^^^^^^^^^^^ + +- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) +- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) +- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) +- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) +- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) +- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) +- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to + reflect changes from the `Pandas-GBQ library version 0.8.0 + `__. + Adds a ``credentials`` argument, which enables the use of any kind of + `google-auth credentials + `__. (:issue:`21627`, + :issue:`22557`, :issue:`23662`) +- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) +- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) +- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) +- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). + The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). +- :meth:`DataFrame.to_sql` now supports writing ``TIMESTAMP WITH TIME ZONE`` types for supported databases. For databases that don't support timezones, datetime data will be stored as timezone unaware local timestamps. See the :ref:`io.sql_datetime_data` for implications (:issue:`9086`). +- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) +- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) +- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) +- :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) +- :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) +- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). +- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). +- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, + all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) +- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). +- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). +- Compatibility with Matplotlib 3.0 (:issue:`22790`). +- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) +- :func:`read_fwf` now accepts keyword `infer_nrows` (:issue:`15138`). +- :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`) +- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`8917`) +- :meth:`Index.difference` now has an optional ``sort`` parameter to specify whether the results should be sorted if possible (:issue:`17839`) +- :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`) +- :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object. +- :meth:`DataFrame.to_stata` and :class:` pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`) +- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the an ``axis`` parameter (:issue: `8839`) +- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) + +.. _whatsnew_0240.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) +- :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) +- Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) +- ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) +- :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) + +.. 
_whatsnew_0240.api_breaking.deps: + +Dependencies have increased minimum versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We have updated our minimum supported versions of dependencies (:issue:`21242`, :issue:`18742`, :issue:`23774`). +If installed, we now require: + ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| numpy | 1.12.0 | X | ++-----------------+-----------------+----------+ +| bottleneck | 1.2.0 | | ++-----------------+-----------------+----------+ +| fastparquet | 0.1.2 | | ++-----------------+-----------------+----------+ +| matplotlib | 2.0.0 | | ++-----------------+-----------------+----------+ +| numexpr | 2.6.1 | | ++-----------------+-----------------+----------+ +| pandas-gbq | 0.8.0 | | ++-----------------+-----------------+----------+ +| pyarrow | 0.7.0 | | ++-----------------+-----------------+----------+ +| pytables | 3.4.2 | | ++-----------------+-----------------+----------+ +| scipy | 0.18.1 | | ++-----------------+-----------------+----------+ +| xlrd | 1.0.0 | | ++-----------------+-----------------+----------+ + +Additionally we no longer depend on `feather-format` for feather based storage +and replaced it with references to `pyarrow` (:issue:`21639` and :issue:`23053`). + +.. _whatsnew_0240.api_breaking.csv_line_terminator: + +`os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`DataFrame.to_csv` now uses :func:`os.linesep` rather than ``'\n'`` +for the default line terminator (:issue:`20353`). +This change only affects when running on Windows, where ``'\r\n'`` was used for line terminator +even when ``'\n'`` was passed in ``line_terminator``. + +Previous Behavior on Windows: + +.. code-block:: ipython + + In [1]: data = pd.DataFrame({ + ...: "string_with_lf": ["a\nbc"], + ...: "string_with_crlf": ["a\r\nbc"] + ...: }) + + In [2]: # When passing file PATH to to_csv, line_terminator does not work, and csv is saved with '\r\n'. + ...: # Also, this converts all '\n's in the data to '\r\n'. + ...: data.to_csv("test.csv", index=False, line_terminator='\n') + + In [3]: with open("test.csv", mode='rb') as f: + ...: print(f.read()) + b'string_with_lf,string_with_crlf\r\n"a\r\nbc","a\r\r\nbc"\r\n' + + In [4]: # When passing file OBJECT with newline option to to_csv, line_terminator works. + ...: with open("test2.csv", mode='w', newline='\n') as f: + ...: data.to_csv(f, index=False, line_terminator='\n') + + In [5]: with open("test2.csv", mode='rb') as f: + ...: print(f.read()) + b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n' + + +New Behavior on Windows: + +- By passing ``line_terminator`` explicitly, line terminator is set to that character. +- The value of ``line_terminator`` only affects the line terminator of CSV, + so it does not change the value inside the data. + + .. code-block:: ipython + + In [1]: data = pd.DataFrame({ + ...: "string_with_lf": ["a\nbc"], + ...: "string_with_crlf": ["a\r\nbc"] + ...: }) + + In [2]: data.to_csv("test.csv", index=False, line_terminator='\n') + + In [3]: with open("test.csv", mode='rb') as f: + ...: print(f.read()) + b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n' + + +- On Windows, the value of ``os.linesep`` is ``'\r\n'``, + so if ``line_terminator`` is not set, ``'\r\n'`` is used for line terminator. +- Again, it does not affect the value inside the data. + + .. 
code-block:: ipython + + In [1]: data = pd.DataFrame({ + ...: "string_with_lf": ["a\nbc"], + ...: "string_with_crlf": ["a\r\nbc"] + ...: }) + + In [2]: data.to_csv("test.csv", index=False) + + In [3]: with open("test.csv", mode='rb') as f: + ...: print(f.read()) + b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n' + + +- For files objects, specifying ``newline`` is not sufficient to set the line terminator. + You must pass in the ``line_terminator`` explicitly, even in this case. + + .. code-block:: ipython + + In [1]: data = pd.DataFrame({ + ...: "string_with_lf": ["a\nbc"], + ...: "string_with_crlf": ["a\r\nbc"] + ...: }) + + In [2]: with open("test2.csv", mode='w', newline='\n') as f: + ...: data.to_csv(f, index=False) + + In [3]: with open("test2.csv", mode='rb') as f: + ...: print(f.read()) + b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n' + +.. _whatsnew_0240.api_breaking.interval_values: + +``IntervalIndex.values`` is now an ``IntervalArray`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). + +Previous Behavior: + +.. code-block:: ipython + + In [1]: idx = pd.interval_range(0, 4) + + In [2]: idx.values + Out[2]: + array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), + Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], + dtype=object) + +New Behavior: + +.. ipython:: python + + idx = pd.interval_range(0, 4) + idx.values + +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. + +For situations where you need an ``ndarray`` of ``Interval`` objects, use +:meth:`numpy.asarray`. + +.. ipython:: python + + np.asarray(idx) + idx.values.astype(object) + + +.. _whatsnew_0240.api.timezone_offset_parsing: + +Parsing Datetime Strings with Timezone Offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` +or :class:`DatetimeIndex` would automatically convert the datetime to UTC +without timezone localization. This is inconsistent from parsing the same +datetime string with :class:`Timestamp` which would preserve the UTC +offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC +offset in the ``tz`` attribute when all the datetime strings have the same +UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) + +*Previous Behavior*: + +.. code-block:: ipython + + In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") + Out[2]: Timestamp('2015-11-18 10:00:00') + + In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") + Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') + + # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) + In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) + +*Current Behavior*: + +.. ipython:: python + + pd.to_datetime("2015-11-18 15:30:00+05:30") + pd.Timestamp("2015-11-18 15:30:00+05:30") + +Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) + +Parsing datetime strings with different UTC offsets will now create an Index of +``datetime.datetime`` objects with different UTC offsets + +.. 
ipython:: python + + idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + idx + idx[0] + idx[1] + +Passing ``utc=True`` will mimic the previous behavior but will correctly indicate +that the dates have been converted to UTC + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) + +.. _whatsnew_0240.api_breaking.calendarday: + +CalendarDay Offset +^^^^^^^^^^^^^^^^^^ + +:class:`Day` and associated frequency alias ``'D'`` were documented to represent +a calendar day; however, arithmetic and operations with :class:`Day` sometimes +respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). + +*Previous Behavior*: + +.. code-block:: ipython + + + In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + + # Respects calendar arithmetic + In [3]: pd.date_range(start=ts, freq='D', periods=3) + Out[3]: + DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', + '2016-11-01 00:00:00+02:00'], + dtype='datetime64[ns, Europe/Helsinki]', freq='D') + + # Respects absolute arithmetic + In [4]: ts + pd.tseries.frequencies.to_offset('D') + Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') + +:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available +and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` +will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) +See the :ref:`documentation here ` for more information. + +Addition with :class:`CalendarDay` across a daylight savings time transition: + +.. ipython:: python + + ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + ts + pd.offsets.Day(1) + ts + pd.offsets.CalendarDay(1) + +.. _whatsnew_0240.api_breaking.period_end_time: + +Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The time values in :class:`Period` and :class:`PeriodIndex` objects are now set +to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, +:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, +or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) + +Previous Behavior: + +.. code-block:: ipython + + In [2]: p = pd.Period('2017-01-01', 'D') + In [3]: pi = pd.PeriodIndex([p]) + + In [4]: pd.Series(pi).dt.end_time[0] + Out[4]: Timestamp(2017-01-01 00:00:00) + + In [5]: p.end_time + Out[5]: Timestamp(2017-01-01 23:59:59.999999999) + +Current Behavior: + +Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as +is the case with :attr:`Period.end_time`, for example + +.. ipython:: python + + p = pd.Period('2017-01-01', 'D') + pi = pd.PeriodIndex([p]) + + pd.Series(pi).dt.end_time[0] + + p.end_time + +.. _whatsnew_0240.api_breaking.sparse_values: + +Sparse Data Structure Refactor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, +is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). +To conform to this interface and for consistency with the rest of pandas, some API breaking +changes were made: + +- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. 
+- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. +- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) +- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): + + * The default value of ``allow_fill`` has changed from ``False`` to ``True``. + * The ``out`` and ``mode`` parameters are now longer accepted (previously, this raised if they were specified). + * Passing a scalar for ``indices`` is no longer allowed. + +- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. +- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. +- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. +- ``DataFrame[column]`` is now a :class:`Series` with sparse values, rather than a :class:`SparseSeries`, when slicing a single column with sparse values (:issue:`23559`). + +Some new warnings are issued for operations that require or are likely to materialize a large dense array: + +- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. +- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. + +In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. + +Finally, a ``Series.sparse`` accessor was added to provide sparse-specific methods like :meth:`Series.sparse.from_coo`. + +.. ipython:: python + + s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]') + s.sparse.density + +.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: + +Raise ValueError in ``DataFrame.to_dict(orient='index')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with +``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) + +.. ipython:: python + :okexcept: + + df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) + df + + df.to_dict(orient='index') + +.. _whatsnew_0240.api.datetimelike.normalize: + +Tick DateOffset Normalize Restrictions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, +:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with +``normalize=True`` is no longer supported. This prevents unexpected behavior +where addition could fail to be monotone or associative. (:issue:`21427`) + +*Previous Behavior*: + +.. code-block:: ipython + + + In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') + + In [3]: ts + Out[3]: Timestamp('2018-06-11 18:01:14') + + In [4]: tic = pd.offsets.Hour(n=2, normalize=True) + ...: + + In [5]: tic + Out[5]: <2 * Hours> + + In [6]: ts + tic + Out[6]: Timestamp('2018-06-11 00:00:00') + + In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) + Out[7]: False + +*Current Behavior*: + +.. 
ipython:: python + + ts = pd.Timestamp('2018-06-11 18:01:14') + tic = pd.offsets.Hour(n=2) + ts + tic + tic + tic == ts + (tic + tic + tic) + + +.. _whatsnew_0240.api.datetimelike: + + +.. _whatsnew_0240.api.period_subtraction: + +Period Subtraction +^^^^^^^^^^^^^^^^^^ + +Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset``. +instead of an integer (:issue:`21314`) + +.. ipython:: python + + june = pd.Period('June 2018') + april = pd.Period('April 2018') + june - april + +Previous Behavior: + +.. code-block:: ipython + + In [2]: june = pd.Period('June 2018') + + In [3]: april = pd.Period('April 2018') + + In [4]: june - april + Out [4]: 2 + +Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return +an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` + +.. ipython:: python + + pi = pd.period_range('June 2018', freq='M', periods=3) + pi - pi[0] + +Previous Behavior: + +.. code-block:: ipython + + In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) + + In [3]: pi - pi[0] + Out[3]: Int64Index([0, 1, 2], dtype='int64') + + +.. _whatsnew_0240.api.timedelta64_subtract_nan: + +Addition/Subtraction of ``NaN`` from :class:`DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Adding or subtracting ``NaN`` from a :class:`DataFrame` column with +``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning +all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and +``Series`` behavior (:issue:`22163`) + +.. ipython:: python + :okexcept: + + df = pd.DataFrame([pd.Timedelta(days=1)]) + df - np.nan + +Previous Behavior: + +.. code-block:: ipython + + In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) + + In [5]: df - np.nan + Out[5]: + 0 + 0 NaT + +.. _whatsnew_0240.api.dataframe_cmp_broadcasting: + +DataFrame Comparison Operations Broadcasting Changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Previously, the broadcasting behavior of :class:`DataFrame` comparison +operations (``==``, ``!=``, ...) was inconsistent with the behavior of +arithmetic operations (``+``, ``-``, ...). The behavior of the comparison +operations has been changed to match the arithmetic operations in these cases. +(:issue:`22880`) + +The affected cases are: + +- operating against a 2-dimensional ``np.ndarray`` with either 1 row or 1 column will now broadcast the same way a ``np.ndarray`` would (:issue:`23000`). +- a list or tuple with length matching the number of rows in the :class:`DataFrame` will now raise ``ValueError`` instead of operating column-by-column (:issue:`22880`. +- a list or tuple with length matching the number of columns in the :class:`DataFrame` will now operate row-by-row instead of raising ``ValueError`` (:issue:`22880`). + +Previous Behavior: + +.. code-block:: ipython + + In [3]: arr = np.arange(6).reshape(3, 2) + In [4]: df = pd.DataFrame(arr) + + In [5]: df == arr[[0], :] + ...: # comparison previously broadcast where arithmetic would raise + Out[5]: + 0 1 + 0 True True + 1 False False + 2 False False + In [6]: df + arr[[0], :] + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) + + In [7]: df == (1, 2) + ...: # length matches number of columns; + ...: # comparison previously raised where arithmetic would broadcast + ... 
+ ValueError: Invalid broadcasting comparison [(1, 2)] with block values + In [8]: df + (1, 2) + Out[8]: + 0 1 + 0 1 3 + 1 3 5 + 2 5 7 + + In [9]: df == (1, 2, 3) + ...: # length matches number of rows + ...: # comparison previously broadcast where arithmetic would raise + Out[9]: + 0 1 + 0 False True + 1 True False + 2 False False + In [10]: df + (1, 2, 3) + ... + ValueError: Unable to coerce to Series, length must be 2: given 3 + +*Current Behavior*: + +.. ipython:: python + :okexcept: + + arr = np.arange(6).reshape(3, 2) + df = pd.DataFrame(arr) + +.. ipython:: python + + # Comparison operations and arithmetic operations both broadcast. + df == arr[[0], :] + df + arr[[0], :] + +.. ipython:: python + + # Comparison operations and arithmetic operations both broadcast. + df == (1, 2) + df + (1, 2) + +.. ipython:: python + :okexcept: + + # Comparison operations and arithmetic opeartions both raise ValueError. + df == (1, 2, 3) + df + (1, 2, 3) + + +.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: + +DataFrame Arithmetic Operations Broadcasting Changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`DataFrame` arithmetic operations when operating with 2-dimensional +``np.ndarray`` objects now broadcast in the same way as ``np.ndarray`` +broadcast. (:issue:`23000`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: arr = np.arange(6).reshape(3, 2) + In [4]: df = pd.DataFrame(arr) + In [5]: df + arr[[0], :] # 1 row, 2 columns + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) + In [6]: df + arr[:, [1]] # 1 column, 3 rows + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) + +*Current Behavior*: + +.. ipython:: python + + arr = np.arange(6).reshape(3, 2) + df = pd.DataFrame(arr) + df + +.. ipython:: python + + df + arr[[0], :] # 1 row, 2 columns + df + arr[:, [1]] # 1 column, 3 rows + + +.. _whatsnew_0240.api.extension: + +ExtensionType Changes +^^^^^^^^^^^^^^^^^^^^^ + +**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** + +Pandas now requires that extension dtypes be hashable. The base class implements +a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should +update the ``ExtensionDtype._metadata`` tuple to match the signature of your +``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). + +**Other changes** + +- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) +- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore + the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) +- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). 
+- The ``ExtensionArray`` constructor, ``_from_sequence``, now takes the keyword argument ``copy=False`` (:issue:`21185`)
+- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
+- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`)
+- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
+- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
+- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`).
+- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
+- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
+- Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`)
+- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
+- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
+- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
+- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
+- :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`).
+- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
+- Bug when grouping with :meth:`DataFrame.groupby` and aggregating on an ``ExtensionArray``: the actual ``ExtensionArray`` dtype was not returned (:issue:`23227`).
+- A default repr for :class:`ExtensionArray` is now provided (:issue:`23601`).
+
+.. _whatsnew_0240.api.incompatibilities:
+
+Series and Index Data-Dtype Incompatibilities
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``Series`` and ``Index`` constructors now raise when the
+data is incompatible with a passed ``dtype=`` (:issue:`15832`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [4]: pd.Series([-1], dtype="uint64")
+    Out [4]:
+    0    18446744073709551615
+    dtype: uint64
+
+Current Behavior:
+
+.. code-block:: ipython
+
+    In [4]: pd.Series([-1], dtype="uint64")
+    Out [4]:
+    ...
+    OverflowError: Trying to coerce negative values to unsigned integers
+
+.. _whatsnew_0240.api.crosstab_dtypes:
+
+Crosstab Preserves Dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`crosstab` will now preserve dtypes in some cases that previously would
+cast from integer dtype to floating dtype (:issue:`22019`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+       ...:                    'c': [1, 1, np.nan, 1, 1]})
+    In [4]: pd.crosstab(df.a, df.b, normalize='columns')
+    Out[4]:
+    b    3    4
+    a
+    1  0.5  0.0
+    2  0.5  1.0
+
+Current Behavior:
+
+.. 
code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + +Datetimelike API Changes +^^^^^^^^^^^^^^^^^^^^^^^^ + +- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) +- :class:`DateOffset` objects are now immutable. Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) +- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) +- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) +- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) + +.. _whatsnew_0240.api.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) +- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in + :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of + a ``KeyError`` (:issue:`21678`). +- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) +- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) +- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) +- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) +- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) +- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types, + has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) +- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) +- Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`) +- :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`). +- :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather that a ``ValueError``, if a searched for key is not found in its categories (:issue:`23466`). +- :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. 
Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`). +- The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`) +- :meth:`CategoricalIndex.reindex` now raises a ``ValueError`` if the target index is non-unique and not equal to the current index. It previously only raised if the target index was not of a categorical dtype (:issue:`23963`). + +.. _whatsnew_0240.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- :attr:`MultiIndex.labels` has been deprecated and replaced by :attr:`MultiIndex.codes`. + The functionality is unchanged. The new name better reflects the natures of + these codes and makes the ``MultiIndex`` API more similar to the API for :class:`CategoricalIndex`(:issue:`13443`). + As a consequence, other uses of the name ``labels`` in ``MultiIndex`` have also been deprecated and replaced with ``codes``: + - You should initialize a ``MultiIndex`` instance using a parameter named ``codes`` rather than ``labels``. + - ``MultiIndex.set_labels`` has been deprecated in favor of :meth:`MultiIndex.set_codes`. + - For method :meth:`MultiIndex.copy`, the ``labels`` parameter has been deprecated and replaced by a ``codes`` parameter. +- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) +- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) +- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) +- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) +- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) +- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) +- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) +- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain + many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) +- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) +- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) +- The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`). +- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have deprecated the ``errors`` argument in favor of the ``nonexistent`` argument (:issue:`8917`) +- The class ``FrozenNDArray`` has been deprecated. 
When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`) +- The methods :meth:`DataFrame.update` and :meth:`Panel.update` have deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`) +- The methods :meth:`Series.str.partition` and :meth:`Series.str.rpartition` have deprecated the ``pat`` keyword in favor of ``sep`` (:issue:`22676`) +- Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of + `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) +- :meth:`ExtensionArray._formatting_values` is deprecated. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) +- :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`) +- Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`) +- Constructing a :class:`DatetimeIndex` from data with ``timedelta64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23675`) +- The ``keep_tz=False`` option (the default) of the ``keep_tz`` keyword of + :meth:`DatetimeIndex.to_series` is deprecated (:issue:`17832`). +- Timezone converting a tz-aware ``datetime.datetime`` or :class:`Timestamp` with :class:`Timestamp` and the ``tz`` argument is now deprecated. Instead, use :meth:`Timestamp.tz_convert` (:issue:`23579`) +- :func:`pandas.types.is_period` is deprecated in favor of `pandas.types.is_period_dtype` (:issue:`23917`) +- :func:`pandas.types.is_datetimetz` is deprecated in favor of `pandas.types.is_datetime64tz` (:issue:`23917`) +- Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`) +- Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). + +.. _whatsnew_0240.deprecations.datetimelike_int_ops: + +Integer Addition/Subtraction with Datetime-like Classes Is Deprecated +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In the past, users could add or subtract integers or integer-dtypes arrays +from :class:`Period`, :class:`PeriodIndex`, and in some cases +:class:`Timestamp`, :class:`DatetimeIndex` and :class:`TimedeltaIndex`. + +This usage is now deprecated. Instead add or subtract integer multiples of +the object's ``freq`` attribute (:issue:`21939`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: per = pd.Period('2016Q1') + In [4]: per + 3 + Out[4]: Period('2016Q4', 'Q-DEC') + + In [5]: ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour()) + In [6]: ts + 2 + Out[6]: Timestamp('1994-05-06 14:15:16', freq='H') + + In [7]: tdi = pd.timedelta_range('1D', periods=2) + In [8]: tdi - np.array([2, 1]) + Out[8]: TimedeltaIndex(['-1 days', '1 days'], dtype='timedelta64[ns]', freq=None) + + In [9]: dti = pd.date_range('2001-01-01', periods=2, freq='7D') + In [10]: dti + pd.Index([1, 2]) + Out[10]: DatetimeIndex(['2001-01-08', '2001-01-22'], dtype='datetime64[ns]', freq=None) + +Current Behavior: + +.. 
ipython:: python + :okwarning: + + per = pd.Period('2016Q1') + per + 3 + + per = pd.Period('2016Q1') + per + 3 * per.freq + + ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour()) + ts + 2 * ts.freq + + tdi = pd.timedelta_range('1D', periods=2) + tdi - np.array([2 * tdi.freq, 1 * tdi.freq]) + + dti = pd.date_range('2001-01-01', periods=2, freq='7D') + dti + pd.Index([1 * dti.freq, 2 * dti.freq]) + +.. _whatsnew_0240.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) +- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) +- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) +- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) +- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) +- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) +- The ``Series`` constructor and ``.astype`` method will now raise a ``ValueError`` if timestamp dtypes are passed in without a unit (e.g. ``np.datetime64``) for the ``dtype`` parameter (:issue:`15987`) +- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) +- The modules ``pandas.types``, ``pandas.computation``, and ``pandas.util.decorators`` have been removed (:issue:`16157`, :issue:`16250`) +- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) +- :func:`pandas.pnow`, :func:`pandas.match`, :func:`pandas.groupby`, :func:`pd.get_store`, ``pd.Expr``, and ``pd.Term`` have been removed (:issue:`15538`, :issue:`15940`) +- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) +- ``pandas.parser``, ``pandas.lib``, and ``pandas.tslib`` have been removed (:issue:`15537`) +- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) +- :meth:`DataFrame.consolidate` and :meth:`Series.consolidate` have been removed (:issue:`15501`) +- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) +- The module ``pandas.tools`` has been removed (:issue:`15358`, :issue:`16005`) +- :meth:`SparseArray.get_values` and :meth:`SparseArray.to_dense` have dropped the ``fill`` parameter (:issue:`14686`) +- :meth:`DataFrame.sortlevel` and :meth:`Series.sortlevel` have been removed (:issue:`15099`) +- :meth:`SparseSeries.to_dense` has dropped the ``sparse_only`` parameter (:issue:`14686`) +- :meth:`DataFrame.astype` and :meth:`Series.astype` have renamed the ``raise_on_error`` argument to ``errors`` (:issue:`14967`) +- ``is_sequence``, ``is_any_int_dtype``, and ``is_floating_dtype`` have been removed from ``pandas.api.types`` (:issue:`16163`, :issue:`16189`) + +.. _whatsnew_0240.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Slicing Series and DataFrames with an monotonically increasing :class:`CategoricalIndex` + is now very fast and has speed comparable to slicing with an ``Int64Index``. 
+  The speed increase is both when indexing by label (using ``.loc``) and by position (``.iloc``) (:issue:`20395`)
+  Slicing a monotonically increasing :class:`CategoricalIndex` itself (i.e. ``ci[1000:2000]``)
+  shows similar speed improvements as above (:issue:`21659`)
+- Improved performance of :meth:`CategoricalIndex.equals` when comparing to another :class:`CategoricalIndex` (:issue:`24023`)
+- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
+- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
+- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
+- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex`
+  (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
+  is likewise much faster (:issue:`21369`, :issue:`21508`)
+- Improved performance of :meth:`HDFStore.groups` (and dependent functions like
+  :meth:`~HDFStore.keys`) (i.e. ``x in store`` checks are much faster)
+  (:issue:`21372`)
+- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
+- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
+- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)
+- Improved performance of :func:`pd.concat` for ``Series`` objects (:issue:`23404`)
+- Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`)
+- Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`)
+- Fixed a performance regression of :func:`pd.read_csv` on Windows with Python 3.7 (:issue:`23516`)
+- Improved performance of the :class:`Categorical` constructor for ``Series`` objects (:issue:`23814`)
+
+.. _whatsnew_0240.docs:
+
+Documentation Changes
+~~~~~~~~~~~~~~~~~~~~~
+
+- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`)
+-
+-
+
+.. _whatsnew_0240.bug_fixes:
+
+Bug Fixes
+~~~~~~~~~
+
+Categorical
+^^^^^^^^^^^
+
+- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
+- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of the ``na_position`` value (:issue:`22556`).
+- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
+- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
+- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
+- In :meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`)
+- Bug when resampling :meth:`DataFrame.resample()` and aggregating on categorical data, the categorical dtype was getting lost. 
(:issue:`23227`) +- Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) + +Datetimelike +^^^^^^^^^^^^ + +- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) +- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) +- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) +- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) +- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) +- Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) +- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) +- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) +- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) +- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) +- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) +- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) +- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) +- Bug in the :class:`Series` repr with period-dtype data missing a space before the data (:issue:`23601`) +- Bug in :func:`date_range` when decrementing a start date to a past end 
date by a negative frequency (:issue:`23270`) +- Bug in :meth:`Series.min` which would return ``NaN`` instead of ``NaT`` when called on a series of ``NaT`` (:issue:`23282`) +- Bug in :func:`DataFrame.combine` with datetimelike values raising a TypeError (:issue:`23079`) +- Bug in :func:`date_range` with frequency of ``Day`` or higher where dates sufficiently far in the future could wrap around to the past instead of raising ``OutOfBoundsDatetime`` (:issue:`14187`) +- Bug in :class:`PeriodIndex` with attribute ``freq.n`` greater than 1 where adding a :class:`DateOffset` object would return incorrect results (:issue:`23215`) +- Bug in :class:`Series` that interpreted string indices as lists of characters when setting datetimelike values (:issue:`23451`) +- Bug in :class:`Timestamp` constructor which would drop the frequency of an input :class:`Timestamp` (:issue:`22311`) +- Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`) +- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`) +- Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`) + +Timedelta +^^^^^^^^^ +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) +- Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) +- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) +- Bug in :class:`Series` with numeric dtype when adding or subtracting an an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) +- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) +- Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) +- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) +- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) +- Bug in :class:`TimedeltaIndex` where adding a timezone-aware datetime scalar incorrectly returned a timezone-naive :class:`DatetimeIndex` (:issue:`23215`) +- Bug in :class:`TimedeltaIndex` where adding ``np.timedelta64('NaT')`` incorrectly returned an all-`NaT` :class:`DatetimeIndex` instead of an all-`NaT` :class:`TimedeltaIndex` (:issue:`23215`) +- Bug in :class:`Timedelta` and :func:`to_timedelta()` have inconsistencies in supported unit string (:issue:`21762`) +- Bug in :class:`TimedeltaIndex` division where dividing by another :class:`TimedeltaIndex` raised ``TypeError`` instead of returning a :class:`Float64Index` (:issue:`23829`, :issue:`22631`) +- Bug in :class:`TimedeltaIndex` comparison operations where comparing against non-``Timedelta``-like objects would raise ``TypeError`` instead of returning all-``False`` for ``__eq__`` and all-``True`` for ``__ne__`` (:issue:`24056`) + 
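+For example, dividing one ``TimedeltaIndex`` by another now returns a ``Float64Index``
+instead of raising ``TypeError`` (a short, illustrative example of the division fix
+noted above):
+
+.. ipython:: python
+
+    tdi = pd.timedelta_range('1 day', periods=3)
+    tdi / tdi
+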
+Timezones +^^^^^^^^^ + +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) +- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) +- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) +- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) +- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) +- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) +- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) +- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) +- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware ``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) +- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) +- Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) +- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp` constructed with the ``replace`` method across DST (:issue:`18785`) +- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) +- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) +- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) +- Bug in :meth:`DataFrame.drop` and :meth:`Series.drop` when specifying a tz-aware Timestamp key to drop from a :class:`DatetimeIndex` with a DST transition (:issue:`21761`) +- Bug in :class:`DatetimeIndex` constructor where :class:`NaT` and ``dateutil.tz.tzlocal`` would raise an ``OutOfBoundsDatetime`` error (:issue:`23807`) +- Bug in :meth:`DatetimeIndex.tz_localize` and :meth:`Timestamp.tz_localize` with ``dateutil.tz.tzlocal`` near a DST transition that would return an incorrectly localized datetime (:issue:`23807`) +- Bug in :class:`Timestamp` constructor where a ``dateutil.tz.tzutc`` timezone passed with a ``datetime.datetime`` argument would be converted to a ``pytz.UTC`` timezone (:issue:`23807`) + +Offsets +^^^^^^^ + +- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) +- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. 
+  Passing these will now raise ``ValueError`` (:issue:`19398`)
+- Bug in adding :class:`DateOffset` with :class:`DataFrame` or :class:`PeriodIndex` incorrectly raising ``TypeError`` (:issue:`23215`)
+- Bug in comparing :class:`DateOffset` objects with non-DateOffset objects, particularly strings, raising ``ValueError`` instead of returning ``False`` for equality checks and ``True`` for not-equal checks (:issue:`23524`)
+
+Numeric
+^^^^^^^
+
+- Bug in :class:`Series` ``__rmatmul__`` which did not support matrix-vector multiplication (:issue:`21530`)
+- Bug in :func:`factorize` failing with a read-only array (:issue:`12813`)
+- Fixed bug where :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`)
+- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where,
+  when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``),
+  a ``TypeError`` was wrongly raised. For all three methods such calculations are now done correctly (:issue:`16679`).
+- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`)
+- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`)
+- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`)
+- Bug in :meth:`DataFrame.astype` when casting to an extension dtype, which could raise ``AttributeError`` (:issue:`22578`)
+- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`)
+- Bug in :meth:`Series.rpow` with object dtype returning ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`).
+- :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`)
+- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2\ :sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`)
+- Calls such as :meth:`DataFrame.round` with a non-unique :meth:`CategoricalIndex` now return expected data. Previously, data would be improperly duplicated (:issue:`21809`).
+
+Strings
+^^^^^^^
+
+- Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`).
+- Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`). 
+- Bug :func:`Series.str.contains` not respecting the ``na`` argument for a ``Categorical`` dtype ``Series`` (:issue:`22158`) +- Bug in :meth:`Index.str.cat` when the result contained only ``NaN`` (:issue:`24044`) + +Interval +^^^^^^^^ + +- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) +- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) +- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) +- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) +- Bug in ``IntervalTree`` where data containing ``NaN`` triggered a warning and resulted in incorrect indexing queries with :class:`IntervalIndex` (:issue:`23352`) + +Indexing +^^^^^^^^ + +- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) +- :class:`PeriodIndex` now emits a ``KeyError`` when a malformed string is looked up, which is consistent with the behavior of :class:`DateTimeIndex` (:issue:`22803`) +- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :meth:`Series.reindex` when reindexing an empty series with a ``datetime64[ns, tz]`` dtype (:issue:`20869`) +- Bug in :class:`DataFrame` when setting values with ``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) +- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) +- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) +- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) +- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) +- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) +- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`) +- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) +- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. 
However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) +- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) +- Bug in `MultiIndex.set_levels` when levels value is not subscriptable (:issue:`23273`) +- Bug where setting a timedelta column by ``Index`` causes it to be casted to double, and therefore lose precision (:issue:`23511`) +- Bug in :func:`Index.union` and :func:`Index.intersection` where name of the ``Index`` of the result was not computed correctly for certain cases (:issue:`9943`, :issue:`9862`) +- Bug in :class:`Index` slicing with boolean :class:`Index` may raise ``TypeError`` (:issue:`22533`) +- Bug in ``PeriodArray.__setitem__`` when accepting slice and list-like value (:issue:`23978`) + +Missing +^^^^^^^ + +- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) +- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) +- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) +- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) + + +MultiIndex +^^^^^^^^^^ + +- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) +- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a ``Series`` or ``DataFrame`` with a :class:`MultiIndex` index) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) +- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) + +I/O +^^^ + + +.. _whatsnew_0240.bug_fixes.nan_with_str_dtype: + +Proper handling of `np.NaN` in a string data-typed column with the Python engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There was bug in :func:`read_excel` and :func:`read_csv` with the Python +engine, where missing values turned to ``'nan'`` with ``dtype=str`` and +``na_filter=True``. Now, these missing values are converted to the string +missing indicator, ``np.nan``. (:issue `20377`) + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +Previous Behavior: + +.. code-block:: ipython + + In [5]: data = 'a,b,c\n1,,3\n4,5,6' + In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True) + In [7]: df.loc[0, 'b'] + Out[7]: + 'nan' + +Current Behavior: + +.. ipython:: python + + data = 'a,b,c\n1,,3\n4,5,6' + df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True) + df.loc[0, 'b'] + +Notice how we now instead output ``np.nan`` itself instead of a stringified form of it. 
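+
+Because the missing entry is now a real ``np.nan`` rather than the string ``'nan'``,
+missing-value checks behave as expected (a short illustrative check, reusing ``df``
+from the example above):
+
+.. ipython:: python
+
+    pd.isna(df.loc[0, 'b'])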
+ +- Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`) +- Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`) +- Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`) +- Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`) +- :func:`read_html()` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) +- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) +- :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) +- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) +- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`) +- Bug in :func:`read_csv()` in unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`) +- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) +- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) +- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) +- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`) +- Bug in :func:`to_html()` with ``index=False`` misses truncation indicators (...) on truncated DataFrame (:issue:`15019`, :issue:`22783`) +- Bug in :func:`DataFrame.to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) +- Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) +- Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). 
+- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) +- Bug in :func:`read_csv()` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`) +- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`) +- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) +- Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`) +- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) +- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) +- Bug in :meth:`read_excel()` in which column names were not being properly converted to string sometimes in Python 2.x (:issue:`23874`) +- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`18792`, :issue:`20480`) +- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) +- Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) +- :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) + +Plotting +^^^^^^^^ + +- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) +- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) + +Groupby/Resample/Rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) +- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a + ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). +- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a + datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) +- Bug in :meth:`Resampler.apply` when passing postiional arguments to applied func (:issue:`14615`). +- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). +- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). +- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. 
(:issue:`22487`)
+- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`)
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`)
+- Bug in :meth:`DataFrame.expanding` in which the ``axis`` argument was not being respected during aggregations (:issue:`23372`)
+- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`).
+- Bug in :func:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`)
+- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.rank` with ``method='dense'`` and ``pct=True``, which would raise a ``ZeroDivisionError`` when a group has only one member (:issue:`23666`).
+
+Reshaping
+^^^^^^^^^
+
+- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`)
+- Bug in :func:`pandas.concat` when joining only ``Series``; the ``names`` argument of ``concat`` is no longer ignored (:issue:`23490`)
+- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`)
+- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`)
+- Bug in :meth:`DataFrame.where` with an empty DataFrame and empty ``cond`` having non-bool dtype (:issue:`21947`)
+- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
+- Bug in :meth:`DataFrame.replace` raising a ``RecursionError`` when converting out-of-bounds ``datetime64[ns, tz]`` (:issue:`20380`)
+- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
+- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`)
+- Bug in :meth:`DataFrame.replace` raising a ``RecursionError`` when replacing empty lists (:issue:`22083`)
+- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value; the results were inconsistent between using integer key and using string key (:issue:`20656`)
+- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raised an error (:issue:`20516`)
+- Bug in :func:`pandas.wide_to_long` when a string is passed to the ``stubnames`` argument and a column name is a substring of that stubname (:issue:`22468`)
+- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)
+- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`)
+- Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`)
+- Bug in :func:`merge_asof` where a confusing error message was raised when attempting to merge with missing values (:issue:`23189`)
+- Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have a :class:`MultiIndex` for columns (:issue:`23033`). 
+- Bug in :func:`pandas.melt` when passing column names that are not present in ``DataFrame`` (:issue:`23575`) +- Bug in :meth:`DataFrame.append` with a :class:`Series` with a dateutil timezone would raise a ``TypeError`` (:issue:`23682`) +- Bug in ``Series`` construction when passing no data and ``dtype=str`` (:issue:`22477`) +- Bug in :func:`cut` with ``bins`` as an overlapping ``IntervalIndex`` where multiple bins were returned per item instead of raising a ``ValueError`` (:issue:`23980`) +- Bug in :func:`pandas.concat` when joining ``Series`` datetimetz with ``Series`` category would lose timezone (:issue:`23816`) +- Bug in :meth:`DataFrame.join` when joining on partial MultiIndex would drop names (:issue:`20452`). + +.. _whatsnew_0240.bug_fixes.sparse: + +Sparse +^^^^^^ + +- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) +- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) +- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. +- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. +- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. +- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) +- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) +- Bug in :meth:`SparseArary.unique` not returning the unique values (:issue:`19595`) +- Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`) +- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) + +Build Changes +^^^^^^^^^^^^^ + +- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) +- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here `_, and a pandas-specific introduction :ref:`in the contributing guide `. (:issue:`22280`) +- + +Other +^^^^^ + +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`) +- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) +- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) +- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. 
+- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) +- Checking PEP 3141 numbers in :func:`~pandas.api.types.is_scalar` function returns ``True`` (:issue:`22903`) +- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) + +.. _whatsnew_0.24.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.23.4..HEAD diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt deleted file mode 100644 index a1a0857fe6365..0000000000000 --- a/doc/source/whatsnew/v0.24.0.txt +++ /dev/null @@ -1,902 +0,0 @@ -.. _whatsnew_0240: - -v0.24.0 (Month XX, 2018) ------------------------- - -.. warning:: - - Starting January 1, 2019, pandas feature releases will support Python 3 only. - See :ref:`install.dropping-27` for more. - -.. _whatsnew_0240.enhancements: - -New features -~~~~~~~~~~~~ -- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) - - -- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) - -- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing -the user to override the engine's default behavior to include or omit the -dataframe's indexes from the resulting Parquet file. (:issue:`20768`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - - -.. _whatsnew_0240.enhancements.extension_array_operators: - -``ExtensionArray`` operator support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison -operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: - -1. Define each of the operators on your ``ExtensionArray`` subclass. -2. Use an operator implementation from pandas that depends on operators that are already defined - on the underlying elements (scalars) of the ``ExtensionArray``. - -See the :ref:`ExtensionArray Operator Support -` documentation section for details on both -ways of adding operator support. - -.. _whatsnew_0240.enhancements.intna: - -Optional Integer NA Support -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. -Here is an example of the usage. - -We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value -marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`) - -.. ipython:: python - - s = pd.Series([1, 2, np.nan], dtype='Int64') - s - - -Operations on these dtypes will propagate ``NaN`` as other pandas operations. - -.. ipython:: python - - # arithmetic - s + 1 - - # comparison - s == 1 - - # indexing - s.iloc[1:3] - - # operate with other dtypes - s + s.iloc[1:3].astype('Int8') - - # coerce when needed - s + 0.01 - -These dtypes can operate as part of of ``DataFrame``. - -.. 
ipython:: python - - df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) - df - df.dtypes - - -These dtypes can be merged & reshaped & casted. - -.. ipython:: python - - pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes - df['A'].astype(float) - -.. warning:: - - The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. - -.. _whatsnew_0240.enhancements.read_html: - -``read_html`` Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. -Now it understands them, treating them as sequences of cells with the same -value. (:issue:`17054`) - -.. ipython:: python - - result = pd.read_html(""" - - - - - - - - - - - -
-        <th>A</th> <th>B</th> <th>C</th>
-        <td colspan="2">1</td> <td>2</td>
""") - -Previous Behavior: - -.. code-block:: ipython - - In [13]: result - Out [13]: - [ A B C - 0 1 2 NaN] - -Current Behavior: - -.. ipython:: python - - result - - -.. _whatsnew_0240.enhancements.interval: - -Storing Interval Data in Series and DataFrame -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an -:class:`IntervalIndex` like previously (:issue:`19453`). - -.. ipython:: python - - ser = pd.Series(pd.interval_range(0, 5)) - ser - ser.dtype - -Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, -this should result in better performance when storing an array of intervals in -a :class:`Series`. - -Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy -array, but rather an ``ExtensionArray``: - -.. ipython:: python - - ser.values - -This is the same behavior as ``Series.values`` for categorical data. See -:ref:`whatsnew_0240.api_breaking.interval_values` for more. - - -.. _whatsnew_0240.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ -- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) -- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) -- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) -- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) -- Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`) -- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to - reflect changes from the `Pandas-GBQ library version 0.6.0 - `__. - (:issue:`21627`, :issue:`22557`) -- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) -- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) -- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) -- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) -- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). - The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). -- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) -- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) -- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) -- :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) -- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). -- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). -- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). -- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). -- Compatibility with Matplotlib 3.0 (:issue:`22790`). - -.. _whatsnew_0240.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - - -.. _whatsnew_0240.api_breaking.interval_values: - -``IntervalIndex.values`` is now an ``IntervalArray`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an -``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). - -Previous Behavior: - -.. code-block:: ipython - - In [1]: idx = pd.interval_range(0, 4) - - In [2]: idx.values - Out[2]: - array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), - Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], - dtype=object) - -New Behavior: - -.. ipython:: python - - idx = pd.interval_range(0, 4) - idx.values - -This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. - -For situations where you need an ``ndarray`` of ``Interval`` objects, use -:meth:`numpy.asarray` or ``idx.astype(object)``. - -.. ipython:: python - - np.asarray(idx) - idx.values.astype(object) - -.. _whatsnew_0240.api.timezone_offset_parsing: - -Parsing Datetime Strings with Timezone Offsets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` -or :class:`DatetimeIndex` would automatically convert the datetime to UTC -without timezone localization. This is inconsistent from parsing the same -datetime string with :class:`Timestamp` which would preserve the UTC -offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC -offset in the ``tz`` attribute when all the datetime strings have the same -UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) - -*Previous Behavior*: - -.. code-block:: ipython - - In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") - Out[2]: Timestamp('2015-11-18 10:00:00') - - In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") - Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') - - # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) - In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) - -*Current Behavior*: - -.. ipython:: python - - pd.to_datetime("2015-11-18 15:30:00+05:30") - pd.Timestamp("2015-11-18 15:30:00+05:30") - -Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` - -.. 
ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) - -Parsing datetime strings with different UTC offsets will now create an Index of -``datetime.datetime`` objects with different UTC offsets - -.. ipython:: python - - idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - idx - idx[0] - idx[1] - -Passing ``utc=True`` will mimic the previous behavior but will correctly indicate -that the dates have been converted to UTC - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) - -.. _whatsnew_0240.api_breaking.calendarday: - -CalendarDay Offset -^^^^^^^^^^^^^^^^^^ - -:class:`Day` and associated frequency alias ``'D'`` were documented to represent -a calendar day; however, arithmetic and operations with :class:`Day` sometimes -respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). - -*Previous Behavior*: - -.. code-block:: ipython - - - In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - - # Respects calendar arithmetic - In [3]: pd.date_range(start=ts, freq='D', periods=3) - Out[3]: - DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', - '2016-11-01 00:00:00+02:00'], - dtype='datetime64[ns, Europe/Helsinki]', freq='D') - - # Respects absolute arithmetic - In [4]: ts + pd.tseries.frequencies.to_offset('D') - Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') - -:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available -and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` -will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) -See the :ref:`documentation here ` for more information. - -Addition with :class:`CalendarDay` across a daylight savings time transition: - -.. ipython:: python - - ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - ts + pd.offsets.Day(1) - ts + pd.offsets.CalendarDay(1) - -.. _whatsnew_0240.api_breaking.period_end_time: - -Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The time values in :class:`Period` and :class:`PeriodIndex` objects are now set -to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, -:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, -or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) - -Previous Behavior: - -.. code-block:: ipython - - In [2]: p = pd.Period('2017-01-01', 'D') - In [3]: pi = pd.PeriodIndex([p]) - - In [4]: pd.Series(pi).dt.end_time[0] - Out[4]: Timestamp(2017-01-01 00:00:00) - - In [5]: p.end_time - Out[5]: Timestamp(2017-01-01 23:59:59.999999999) - -Current Behavior: - -Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as -is the case with :attr:`Period.end_time`, for example - -.. ipython:: python - - p = pd.Period('2017-01-01', 'D') - pi = pd.PeriodIndex([p]) - - pd.Series(pi).dt.end_time[0] - - p.end_time - -.. _whatsnew_0240.api.datetimelike.normalize: - -Tick DateOffset Normalize Restrictions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, -:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with -``normalize=True`` is no longer supported. This prevents unexpected behavior -where addition could fail to be monotone or associative. 
(:issue:`21427`) - -*Previous Behavior*: - -.. code-block:: ipython - - - In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') - - In [3]: ts - Out[3]: Timestamp('2018-06-11 18:01:14') - - In [4]: tic = pd.offsets.Hour(n=2, normalize=True) - ...: - - In [5]: tic - Out[5]: <2 * Hours> - - In [6]: ts + tic - Out[6]: Timestamp('2018-06-11 00:00:00') - - In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) - Out[7]: False - -*Current Behavior*: - -.. ipython:: python - - ts = pd.Timestamp('2018-06-11 18:01:14') - tic = pd.offsets.Hour(n=2) - ts + tic + tic + tic == ts + (tic + tic + tic) - - -.. _whatsnew_0240.api.datetimelike: - - -.. _whatsnew_0240.api.period_subtraction: - -Period Subtraction -^^^^^^^^^^^^^^^^^^ - -Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset``. -instead of an integer (:issue:`21314`) - -.. ipython:: python - - june = pd.Period('June 2018') - april = pd.Period('April 2018') - june - april - -Previous Behavior: - -.. code-block:: ipython - - In [2]: june = pd.Period('June 2018') - - In [3]: april = pd.Period('April 2018') - - In [4]: june - april - Out [4]: 2 - -Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return -an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` - -.. ipython:: python - - pi = pd.period_range('June 2018', freq='M', periods=3) - pi - pi[0] - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) - - In [3]: pi - pi[0] - Out[3]: Int64Index([0, 1, 2], dtype='int64') - - -.. _whatsnew_0240.api.timedelta64_subtract_nan: - -Addition/Subtraction of ``NaN`` from :class:`DataFrame` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Adding or subtracting ``NaN`` from a :class:`DataFrame` column with -``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning -all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and -``Series`` behavior (:issue:`22163`) - -.. ipython:: python - :okexcept: - - df = pd.DataFrame([pd.Timedelta(days=1)]) - df - np.nan - -Previous Behavior: - -.. code-block:: ipython - - In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) - - In [5]: df - np.nan - Out[5]: - 0 - 0 NaT - - -.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: - -DataFrame Arithmetic Operations Broadcasting Changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`DataFrame` arithmetic operations when operating with 2-dimensional -``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s -broadcast. (:issue:`23000`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: arr = np.arange(6).reshape(3, 2) - In [4]: df = pd.DataFrame(arr) - In [5]: df + arr[[0], :] # 1 row, 2 columns - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) - In [6]: df + arr[:, [1]] # 1 column, 3 rows - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) - -*Current Behavior*: - -.. ipython:: python - arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr) - df - -.. ipython:: python - df + arr[[0], :] # 1 row, 2 columns - df + arr[:, [1]] # 1 column, 3 rows - - -.. _whatsnew_0240.api.extension: - -ExtensionType Changes -^^^^^^^^^^^^^^^^^^^^^ - -- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) -- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore - the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) -- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) -- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). -- The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) -- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) -- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) -- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) -- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) -- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). -- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) -- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) - -.. _whatsnew_0240.api.incompatibilities: - -Series and Index Data-Dtype Incompatibilities -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``Series`` and ``Index`` constructors now raise when the -data is incompatible with a passed ``dtype=`` (:issue:`15832`) - -Previous Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - 0 18446744073709551615 - dtype: uint64 - -Current Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - ... - OverflowError: Trying to coerce negative values to unsigned integers - -.. _whatsnew_0240.api.crosstab_dtypes - -Crosstab Preserves Dtypes -^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`crosstab` will preserve now dtypes in some cases that previously would -cast from integer dtype to floating dtype (:issue:`22019`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - Out[4]: - b 3 4 - a - 1 0.5 0.0 - 2 0.5 1.0 - -Current Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - -Datetimelike API Changes -^^^^^^^^^^^^^^^^^^^^^^^^ - -- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) -- :class:`DateOffset` objects are now immutable. 
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) -- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) -- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) -- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) - -.. _whatsnew_0240.api.other: - -Other API Changes -^^^^^^^^^^^^^^^^^ - -- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) -- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in - :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of - a ``KeyError`` (:issue:`21678`). -- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) -- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) -- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) -- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) -- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) -- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) - -.. _whatsnew_0240.deprecations: - -Deprecations -~~~~~~~~~~~~ - -- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) -- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) -- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) -- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) -- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) -- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) -- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) -- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain - many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. 
(:issue:`21950`) -- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) -- :func:`DatetimeIndex.shift` now accepts ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`) - -.. _whatsnew_0240.prior_deprecations: - -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) -- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) -- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) -- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) -- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) -- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) -- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) -- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) -- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) -- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) -- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) - -.. _whatsnew_0240.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`) -- Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) -- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) -- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` - (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` - is likewise much faster (:issue:`21369`, :issue:`21508`) -- Improved performance of :meth:`HDFStore.groups` (and dependent functions like - :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) - (:issue:`21372`) -- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) -- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) - - -.. _whatsnew_0240.docs: - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) -- -- - -.. _whatsnew_0240.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -Categorical -^^^^^^^^^^^ - -- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. 
Also changes the behavior of ``.from_codes([1.1, 2.0])``. -- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) -- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). - -Datetimelike -^^^^^^^^^^^^ - -- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) -- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) -- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) -- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) -- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) -- Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) -- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) -- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) -- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) -- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) -- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) -- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) - -Timedelta -^^^^^^^^^ -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly 
returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) -- Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) -- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) -- Bug in :class:`Series` with numeric dtype when adding or subtracting an an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) -- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) -- Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) -- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) -- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) -- - -Timezones -^^^^^^^^^ - -- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) -- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) -- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) -- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) -- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) -- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) -- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) -- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) -- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) -- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) -- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) -- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware ``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) -- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) -- Bug in :func:`Dataframe.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) -- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp`s constructed with the ``replace`` method across DST (:issue:`18785`) -- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) -- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) -- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries 
ended on a DST transition (:issue:`19375`, :issue:`10117`) - -Offsets -^^^^^^^ - -- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) -- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) -- - -Numeric -^^^^^^^ - -- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) -- Bug in :func:`factorize` fails with read-only array (:issue:`12813`) -- Fixed bug in :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) -- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, - when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), - a ``TypeError`` was wrongly raised. For all three methods such calculation are now done correctly. (:issue:`16679`). -- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) -- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) -- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) -- - -Strings -^^^^^^^ - -- -- -- - -Interval -^^^^^^^^ - -- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) -- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) -- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) -- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) - -Indexing -^^^^^^^^ - -- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) -- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) -- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) -- Bug in :class:`DataFrame` when setting values with ``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) -- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) -- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) -- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) -- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) -- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) -- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. 
(:issue:`19087`) -- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) -- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) -- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) - -Missing -^^^^^^^ - -- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) -- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) -- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) -- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) - - -MultiIndex -^^^^^^^^^^ - -- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) -- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) -- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) - -I/O -^^^ - -- :func:`read_html()` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) -- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) -- :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) -- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) -- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) -- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) -- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. 
(:issue:`21552`) -- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) - -Plotting -^^^^^^^^ - -- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) -- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) - -Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - -- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) -- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) -- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a - ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). -- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a - datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) -- Bug in :meth:`Resampler.apply` when passing postiional arguments to applied func (:issue:`14615`). -- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). -- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). -- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`) -- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) - -Sparse -^^^^^^ - -- -- -- - -Reshaping -^^^^^^^^^ - -- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) -- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) -- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) -- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) -- Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) -- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) -- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) -- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) -- Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) -- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname 
(:issue:`22468`) -- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) -- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) - -Build Changes -^^^^^^^^^^^^^ - -- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) -- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here `_, and a pandas-specific introduction :ref:`in the contributing guide `. (:issue:`22280`) -- - -Other -^^^^^ - -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`) -- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) -- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) -- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. -- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) diff --git a/doc/source/whatsnew/v0.4.x.txt b/doc/source/whatsnew/v0.4.x.rst similarity index 97% rename from doc/source/whatsnew/v0.4.x.txt rename to doc/source/whatsnew/v0.4.x.rst index ed9352059a6dc..e54614849c93b 100644 --- a/doc/source/whatsnew/v0.4.x.txt +++ b/doc/source/whatsnew/v0.4.x.rst @@ -3,6 +3,8 @@ v.0.4.3 through v0.4.1 (September 25 - October 9, 2011) ------------------------------------------------------- +{{ common_imports }} + New Features ~~~~~~~~~~~~ @@ -61,3 +63,7 @@ Performance Enhancements .. _ENHed: https://github.com/pandas-dev/pandas/commit/edd9f1945fc010a57fa0ae3b3444d1fffe592591 .. _ENH56: https://github.com/pandas-dev/pandas/commit/56e0c9ffafac79ce262b55a6a13e1b10a88fbe93 +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.4.1..v0.4.3 diff --git a/doc/source/whatsnew/v0.5.0.txt b/doc/source/whatsnew/v0.5.0.rst similarity index 96% rename from doc/source/whatsnew/v0.5.0.txt rename to doc/source/whatsnew/v0.5.0.rst index 6fe6a02b08f70..c6d17cb1e1290 100644 --- a/doc/source/whatsnew/v0.5.0.txt +++ b/doc/source/whatsnew/v0.5.0.rst @@ -4,6 +4,8 @@ v.0.5.0 (October 24, 2011) -------------------------- +{{ common_imports }} + New Features ~~~~~~~~~~~~ @@ -41,3 +43,11 @@ Performance Enhancements .. _ENH61: https://github.com/pandas-dev/pandas/commit/6141961 .. _ENH5c: https://github.com/pandas-dev/pandas/commit/5ca6ff5d822ee4ddef1ec0d87b6d83d8b4bbd3eb + + +.. _whatsnew_0.5.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v0.4.0..v0.5.0 diff --git a/doc/source/whatsnew/v0.6.0.txt b/doc/source/whatsnew/v0.6.0.rst similarity index 97% rename from doc/source/whatsnew/v0.6.0.txt rename to doc/source/whatsnew/v0.6.0.rst index bd01dd0a90a59..de45b3b383129 100644 --- a/doc/source/whatsnew/v0.6.0.txt +++ b/doc/source/whatsnew/v0.6.0.rst @@ -3,6 +3,8 @@ v.0.6.0 (November 25, 2011) --------------------------- +{{ common_imports }} + New Features ~~~~~~~~~~~~ - :ref:`Added ` ``melt`` function to ``pandas.core.reshape`` @@ -54,3 +56,11 @@ Performance Enhancements - VBENCH Significantly improved performance of ``Series.order``, which also makes np.unique called on a Series faster (:issue:`327`) - VBENCH Vastly improved performance of GroupBy on axes with a MultiIndex (:issue:`299`) + + +.. _whatsnew_0.6.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.5.0..v0.6.0 diff --git a/doc/source/whatsnew/v0.6.1.txt b/doc/source/whatsnew/v0.6.1.rst similarity index 96% rename from doc/source/whatsnew/v0.6.1.txt rename to doc/source/whatsnew/v0.6.1.rst index acd5b0774f2bb..d01757775d694 100644 --- a/doc/source/whatsnew/v0.6.1.txt +++ b/doc/source/whatsnew/v0.6.1.rst @@ -48,3 +48,11 @@ Performance improvements - Column deletion in DataFrame copies no data (computes views on blocks) (GH #158) + + +.. _whatsnew_0.6.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.6.0..v0.6.1 diff --git a/doc/source/whatsnew/v0.7.0.txt b/doc/source/whatsnew/v0.7.0.rst similarity index 98% rename from doc/source/whatsnew/v0.7.0.txt rename to doc/source/whatsnew/v0.7.0.rst index 21d91950e7b78..e278bc0738108 100644 --- a/doc/source/whatsnew/v0.7.0.txt +++ b/doc/source/whatsnew/v0.7.0.rst @@ -3,6 +3,8 @@ v.0.7.0 (February 9, 2012) -------------------------- +{{ common_imports }} + New features ~~~~~~~~~~~~ @@ -298,3 +300,11 @@ Performance improvements ``level`` parameter passed (:issue:`545`) - Ported skiplist data structure to C to speed up ``rolling_median`` by about 5-10x in most typical use cases (:issue:`374`) + + +.. _whatsnew_0.7.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.6.1..v0.7.0 diff --git a/doc/source/whatsnew/v0.7.1.txt b/doc/source/whatsnew/v0.7.1.rst similarity index 90% rename from doc/source/whatsnew/v0.7.1.txt rename to doc/source/whatsnew/v0.7.1.rst index bc12cb8d200cd..f1a133797fd59 100644 --- a/doc/source/whatsnew/v0.7.1.txt +++ b/doc/source/whatsnew/v0.7.1.rst @@ -3,6 +3,8 @@ v.0.7.1 (February 29, 2012) --------------------------- +{{ common_imports }} + This release includes a few new features and addresses over a dozen bugs in 0.7.0. @@ -28,3 +30,11 @@ Performance improvements - Improve performance and memory usage of fillna on DataFrame - Can concatenate a list of Series along axis=1 to obtain a DataFrame (:issue:`787`) + + +.. _whatsnew_0.7.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.7.0..v0.7.1 diff --git a/doc/source/whatsnew/v0.7.2.txt b/doc/source/whatsnew/v0.7.2.rst similarity index 89% rename from doc/source/whatsnew/v0.7.2.txt rename to doc/source/whatsnew/v0.7.2.rst index c711639354139..b870db956f4f1 100644 --- a/doc/source/whatsnew/v0.7.2.txt +++ b/doc/source/whatsnew/v0.7.2.rst @@ -3,6 +3,8 @@ v.0.7.2 (March 16, 2012) --------------------------- +{{ common_imports }} + This release targets bugs in 0.7.1, and adds a few minor features. 
New features @@ -25,3 +27,11 @@ Performance improvements - Use khash for Series.value_counts, add raw function to algorithms.py (:issue:`861`) - Intercept __builtin__.sum in groupby (:issue:`885`) + + +.. _whatsnew_0.7.2.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.7.1..v0.7.2 diff --git a/doc/source/whatsnew/v0.7.3.txt b/doc/source/whatsnew/v0.7.3.rst similarity index 92% rename from doc/source/whatsnew/v0.7.3.txt rename to doc/source/whatsnew/v0.7.3.rst index 77cc72d8707cf..30e22f105656c 100644 --- a/doc/source/whatsnew/v0.7.3.txt +++ b/doc/source/whatsnew/v0.7.3.rst @@ -3,6 +3,8 @@ v.0.7.3 (April 12, 2012) ------------------------ +{{ common_imports }} + This is a minor release from 0.7.2 and fixes many minor bugs and adds a number of nice new features. There are also a couple of API changes to note; these should not affect very many users, and we are inclined to call them "bug fixes" @@ -22,7 +24,7 @@ New features from pandas.tools.plotting import scatter_matrix scatter_matrix(df, alpha=0.2) -.. image:: savefig/scatter_matrix_kde.png +.. image:: ../savefig/scatter_matrix_kde.png :width: 5in - Add ``stacked`` argument to Series and DataFrame's ``plot`` method for @@ -32,14 +34,14 @@ New features df.plot(kind='bar', stacked=True) -.. image:: savefig/bar_plot_stacked_ex.png +.. image:: ../savefig/bar_plot_stacked_ex.png :width: 4in .. code-block:: python df.plot(kind='barh', stacked=True) -.. image:: savefig/barh_plot_stacked_ex.png +.. image:: ../savefig/barh_plot_stacked_ex.png :width: 4in - Add log x and y :ref:`scaling options ` to @@ -94,3 +96,11 @@ Series, to be more consistent with the ``groupby`` behavior with DataFrame: grouped = df.groupby('A')['C'] grouped.describe() grouped.apply(lambda x: x.sort_values()[-2:]) # top 2 values + + +.. _whatsnew_0.7.3.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.7.2..v0.7.3 diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.rst similarity index 99% rename from doc/source/whatsnew/v0.8.0.txt rename to doc/source/whatsnew/v0.8.0.rst index 28c043e772605..eedaaa3dfa8bd 100644 --- a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/whatsnew/v0.8.0.rst @@ -3,6 +3,8 @@ v0.8.0 (June 29, 2012) ------------------------ +{{ common_imports }} + This is a major release from 0.7.3 and includes extensive work on the time series handling and processing infrastructure as well as a great deal of new functionality throughout the library. It includes over 700 commits from more @@ -269,3 +271,11 @@ unique. In many cases it will no longer fail (some method like ``append`` still check for uniqueness unless disabled). However, all is not lost: you can inspect ``index.is_unique`` and raise an exception explicitly if it is ``False`` or go to a different code branch. + + +.. _whatsnew_0.8.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.7.3..v0.8.0 diff --git a/doc/source/whatsnew/v0.8.1.txt b/doc/source/whatsnew/v0.8.1.rst similarity index 93% rename from doc/source/whatsnew/v0.8.1.txt rename to doc/source/whatsnew/v0.8.1.rst index add96bec9d1dd..468b99341163c 100644 --- a/doc/source/whatsnew/v0.8.1.txt +++ b/doc/source/whatsnew/v0.8.1.rst @@ -3,6 +3,8 @@ v0.8.1 (July 22, 2012) ---------------------- +{{ common_imports }} + This release includes a few new features, performance enhancements, and over 30 bug fixes from 0.8.0. New features include notably NA friendly string processing functionality and a series of new plot types and options. 
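The renamed whatsnew files above all gain a ``{{ common_imports }}`` placeholder. How that placeholder is expanded is not shown in this diff; as a purely hypothetical sketch (not the actual pandas ``doc/source/conf.py`` code), a Sphinx ``source-read`` hook could perform the substitution along these lines:

.. code-block:: python

   # Hypothetical sketch -- the real substitution mechanism is not part of this diff.
   COMMON_IMPORTS = """\
   .. ipython:: python
      :suppress:

      import numpy as np
      import pandas as pd
   """


   def substitute_common_imports(app, docname, source):
       # Sphinx passes the document text as a one-element list that may be
       # modified in place by 'source-read' handlers.
       if docname.startswith('whatsnew/'):
           source[0] = source[0].replace('{{ common_imports }}', COMMON_IMPORTS)


   def setup(app):
       app.connect('source-read', substitute_common_imports)

The design point is simply that each whatsnew page can share one block of suppressed imports instead of repeating it; the exact hook name and docname filter here are assumptions for illustration.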
@@ -34,3 +36,11 @@ Performance improvements Categorical types - Significant datetime parsing performance improvements + + +.. _whatsnew_0.8.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.8.0..v0.8.1 diff --git a/doc/source/whatsnew/v0.9.0.txt b/doc/source/whatsnew/v0.9.0.rst similarity index 96% rename from doc/source/whatsnew/v0.9.0.txt rename to doc/source/whatsnew/v0.9.0.rst index b60fb9cc64f4a..ee4e8c338c984 100644 --- a/doc/source/whatsnew/v0.9.0.txt +++ b/doc/source/whatsnew/v0.9.0.rst @@ -1,9 +1,6 @@ .. _whatsnew_0900: -.. ipython:: python - :suppress: - - from pandas.compat import StringIO +{{ common_imports }} v0.9.0 (October 7, 2012) ------------------------ @@ -95,3 +92,11 @@ See the :ref:`full release notes ` or issue tracker on GitHub for a complete list. + + +.. _whatsnew_0.9.0.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.8.1..v0.9.0 diff --git a/doc/source/whatsnew/v0.9.1.txt b/doc/source/whatsnew/v0.9.1.rst similarity index 97% rename from doc/source/whatsnew/v0.9.1.txt rename to doc/source/whatsnew/v0.9.1.rst index 1f58170b30244..fe3de9be95a74 100644 --- a/doc/source/whatsnew/v0.9.1.txt +++ b/doc/source/whatsnew/v0.9.1.rst @@ -1,13 +1,10 @@ .. _whatsnew_0901: -.. ipython:: python - :suppress: - - from pandas.compat import StringIO - v0.9.1 (November 14, 2012) -------------------------- +{{ common_imports }} + This is a bug fix release from 0.9.0 and includes several new features and enhancements along with a large number of bug fixes. The new features include by-column sort order for DataFrame and Series, improved NA handling for the rank @@ -158,3 +155,11 @@ API changes See the :ref:`full release notes ` or issue tracker on GitHub for a complete list. + + +.. _whatsnew_0.9.1.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.9.0..v0.9.1 diff --git a/scripts/announce.py b/doc/sphinxext/announce.py similarity index 75% rename from scripts/announce.py rename to doc/sphinxext/announce.py index 7b7933eba54dd..6bc53d3e96d01 100755 --- a/scripts/announce.py +++ b/doc/sphinxext/announce.py @@ -33,19 +33,21 @@ $ ./scripts/announce.py $GITHUB v1.11.0..v1.11.1 > announce.rst """ -from __future__ import print_function, division +from __future__ import division, print_function +import codecs import os import re -import codecs +import textwrap + from git import Repo UTF8Writer = codecs.getwriter('utf8') -this_repo = Repo(os.path.join(os.path.dirname(__file__), "..")) +this_repo = Repo(os.path.join(os.path.dirname(__file__), "..", "..")) author_msg = """\ -A total of %d people contributed to this release. People with a "+" by their -names contributed a patch for the first time. +A total of %d people contributed patches to this release. People with a +"+" by their names contributed a patch for the first time. 
""" pull_request_msg = """\ @@ -98,19 +100,35 @@ def get_pull_requests(repo, revision_range): return prs -def main(revision_range, repo): +def build_components(revision_range, heading="Contributors"): lst_release, cur_release = [r.strip() for r in revision_range.split('..')] - - # document authors authors = get_authors(revision_range) - heading = u"Contributors" - print() - print(heading) - print(u"=" * len(heading)) - print(author_msg % len(authors)) - for s in authors: - print(u'* ' + s) + return { + 'heading': heading, + 'author_message': author_msg % len(authors), + 'authors': authors, + } + + +def build_string(revision_range, heading="Contributors"): + components = build_components(revision_range, heading=heading) + components['uline'] = '=' * len(components['heading']) + components['authors'] = "* " + "\n* ".join(components['authors']) + + tpl = textwrap.dedent("""\ + {heading} + {uline} + + {author_message} + {authors}""").format(**components) + return tpl + + +def main(revision_range): + # document authors + text = build_string(revision_range) + print(text) if __name__ == "__main__": @@ -118,7 +136,5 @@ def main(revision_range, repo): parser = ArgumentParser(description="Generate author lists for release") parser.add_argument('revision_range', help='..') - parser.add_argument('--repo', help="Github org/repository", - default="pandas-dev/pandas") args = parser.parse_args() - main(args.revision_range, args.repo) + main(args.revision_range) diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py new file mode 100644 index 0000000000000..8c9fa5bc961d1 --- /dev/null +++ b/doc/sphinxext/contributors.py @@ -0,0 +1,49 @@ +"""Sphinx extension for listing code contributors to a release. + +Usage:: + + .. contributors:: v0.23.0..v0.23.1 + +This will be replaced with a message indicating the number of +code contributors and commits, and then list each contributor +individually. 
+""" +from docutils import nodes +from docutils.parsers.rst import Directive +import git + +from announce import build_components + + +class ContributorsDirective(Directive): + required_arguments = 1 + name = 'contributors' + + def run(self): + range_ = self.arguments[0] + try: + components = build_components(range_) + except git.GitCommandError: + return [ + self.state.document.reporter.warning( + "Cannot find contributors for range '{}'".format(range_), + line=self.lineno) + ] + else: + message = nodes.paragraph() + message += nodes.Text(components['author_message']) + + listnode = nodes.bullet_list() + + for author in components['authors']: + para = nodes.paragraph() + para += nodes.Text(author) + listnode += nodes.list_item('', para) + + return [message, listnode] + + +def setup(app): + app.add_directive('contributors', ContributorsDirective) + + return {'version': '0.1'} diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000..e31511e5b8afe --- /dev/null +++ b/environment.yml @@ -0,0 +1,53 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + # required + - numpy>=1.15 + - python=3 + - python-dateutil>=2.5.0 + - pytz + + # development + - asv + - cython>=0.28.2 + - flake8 + - flake8-comprehensions + - flake8-rst>=0.6.0 + - gitpython + - hypothesis>=3.82 + - isort + - moto + - pytest>=4.0 + - sphinx + - sphinxcontrib-spelling + + # optional + - beautifulsoup4>=4.2.1 + - blosc + - bottleneck>=1.2.0 + - fastparquet>=0.1.2 + - html5lib + - ipython>=5.6.0 + - ipykernel + - jinja2 + - lxml + - matplotlib>=2.0.0 + - nbsphinx + - numexpr>=2.6.8 + - openpyxl + - pyarrow>=0.7.0 + - pytables>=3.4.2 + - pytest-cov + - pytest-xdist + - scipy>=1.1 + - seaborn + - sqlalchemy + - statsmodels + - xarray + - xlrd + - xlsxwriter + - xlwt + - pip: + - cpplint diff --git a/pandas/__init__.py b/pandas/__init__.py index e446782d9665e..e86ed86fda74f 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -45,37 +45,11 @@ from pandas.core.computation.api import * from pandas.core.reshape.api import * -# deprecate tools.plotting, plot_params and scatter_matrix on the top namespace -import pandas.tools.plotting -plot_params = pandas.plotting._style._Options(deprecated=True) -# do not import deprecate to top namespace -scatter_matrix = pandas.util._decorators.deprecate( - 'pandas.scatter_matrix', pandas.plotting.scatter_matrix, '0.20.0', - 'pandas.plotting.scatter_matrix') - from pandas.util._print_versions import show_versions from pandas.io.api import * from pandas.util._tester import test import pandas.testing -# extension module deprecations -from pandas.util._depr_module import _DeprecatedModule - -parser = _DeprecatedModule(deprmod='pandas.parser', - removals=['na_values'], - moved={'CParserError': 'pandas.errors.ParserError'}) -lib = _DeprecatedModule(deprmod='pandas.lib', deprmodto=False, - moved={'Timestamp': 'pandas.Timestamp', - 'Timedelta': 'pandas.Timedelta', - 'NaT': 'pandas.NaT', - 'infer_dtype': 'pandas.api.types.infer_dtype'}) -tslib = _DeprecatedModule(deprmod='pandas.tslib', - moved={'Timestamp': 'pandas.Timestamp', - 'Timedelta': 'pandas.Timedelta', - 'NaT': 'pandas.NaT', - 'NaTType': 'type(pandas.NaT)', - 'OutOfBoundsDatetime': 'pandas.errors.OutOfBoundsDatetime'}) - # use the closest tagged version if possible from ._version import get_versions v = get_versions() diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 0888cf3c85f2f..5df1e381ea3ce 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -1,9 
+1,6 @@ from util cimport numeric -cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil - - cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: cdef: numeric t diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d2914dc8ac751..e77899507833f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -10,12 +10,12 @@ from libc.math cimport fabs, sqrt import numpy as np cimport numpy as cnp from numpy cimport (ndarray, - NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8, + NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8, + NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8, NPY_FLOAT32, NPY_FLOAT64, NPY_OBJECT, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t, - double_t) + uint32_t, uint64_t, float32_t, float64_t) cnp.import_array() @@ -31,10 +31,9 @@ import missing cdef float64_t FP_ERR = 1e-13 -cdef double NaN = np.NaN -cdef double nan = NaN +cdef float64_t NaN = np.NaN -cdef int64_t iNaT = get_nat() +cdef int64_t NPY_NAT = get_nat() tiebreakers = { 'average': TIEBREAK_AVERAGE, @@ -76,6 +75,8 @@ class NegInfinity(object): __ge__ = lambda self, other: isinstance(other, NegInfinity) +@cython.wraparound(False) +@cython.boundscheck(False) cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr): """ Efficiently find the unique first-differences of the given array. @@ -125,11 +126,11 @@ def is_lexsorted(list_of_arrays: list) -> bint: nlevels = len(list_of_arrays) n = len(list_of_arrays[0]) - cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) + cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) for i in range(nlevels): arr = list_of_arrays[i] assert arr.dtype.name == 'int64' - vecs[i] = cnp.PyArray_DATA(arr) + vecs[i] = cnp.PyArray_DATA(arr) # Assume uniqueness?? 
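The pandas/__init__.py hunk above deletes the ``_DeprecatedModule`` shims and the top-level ``scatter_matrix``/``plot_params`` aliases. The replacement locations were already spelled out by the removed ``moved=`` mappings; a short sketch of the updated imports, taken directly from those mappings:

    # Import locations for names whose deprecation shims are removed above
    from pandas.plotting import scatter_matrix      # was pandas.scatter_matrix
    from pandas.errors import ParserError           # was pandas.parser.CParserError
    from pandas.errors import OutOfBoundsDatetime   # was pandas.tslib.OutOfBoundsDatetime
    from pandas.api.types import infer_dtype        # was pandas.lib.infer_dtype
    from pandas import Timestamp, Timedelta, NaT    # were pandas.lib / pandas.tslib attributes
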
with nogil: @@ -196,7 +197,7 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): @cython.boundscheck(False) @cython.wraparound(False) -cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil: +def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric: cdef: Py_ssize_t i, j, l, m, n = a.shape[0] numeric x @@ -239,7 +240,7 @@ def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None): int64_t nobs = 0 float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor - N, K = ( mat).shape + N, K = (mat).shape if minp is None: minpv = 1 @@ -304,7 +305,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): int64_t nobs = 0 float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor - N, K = ( mat).shape + N, K = (mat).shape result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) @@ -359,9 +360,13 @@ ctypedef fused algos_t: float64_t float32_t object - int32_t int64_t + int32_t + int16_t + int8_t uint64_t + uint32_t + uint16_t uint8_t @@ -402,7 +407,7 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): nleft = len(old) nright = len(new) indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) + indexer[:] = -1 if limit is None: lim = nright @@ -459,7 +464,12 @@ pad_float32 = pad["float32_t"] pad_object = pad["object"] pad_int64 = pad["int64_t"] pad_int32 = pad["int32_t"] +pad_int16 = pad["int16_t"] +pad_int8 = pad["int8_t"] pad_uint64 = pad["uint64_t"] +pad_uint32 = pad["uint32_t"] +pad_uint16 = pad["uint16_t"] +pad_uint8 = pad["uint8_t"] pad_bool = pad["uint8_t"] @@ -519,7 +529,7 @@ def pad_2d_inplace(ndarray[algos_t, ndim=2] values, algos_t val int lim, fill_count = 0 - K, N = ( values).shape + K, N = (values).shape # GH#2778 if N == 0: @@ -595,7 +605,7 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): nleft = len(old) nright = len(new) indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) + indexer[:] = -1 if limit is None: lim = nright @@ -653,7 +663,12 @@ backfill_float32 = backfill["float32_t"] backfill_object = backfill["object"] backfill_int64 = backfill["int64_t"] backfill_int32 = backfill["int32_t"] +backfill_int16 = backfill["int16_t"] +backfill_int8 = backfill["int8_t"] backfill_uint64 = backfill["uint64_t"] +backfill_uint32 = backfill["uint32_t"] +backfill_uint16 = backfill["uint16_t"] +backfill_uint8 = backfill["uint8_t"] backfill_bool = backfill["uint8_t"] @@ -713,7 +728,7 @@ def backfill_2d_inplace(ndarray[algos_t, ndim=2] values, algos_t val int lim, fill_count = 0 - K, N = ( values).shape + K, N = (values).shape # GH#2778 if N == 0: @@ -778,7 +793,7 @@ arrmap_bool = arrmap["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic(ndarray[algos_t] arr, bint timelike): +def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): """ Returns ------- @@ -795,7 +810,7 @@ def is_monotonic(ndarray[algos_t] arr, bint timelike): n = len(arr) if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + if arr[0] != arr[0] or (timelike and arr[0] == NPY_NAT): # single value is NaN return False, False, True else: @@ -803,7 +818,7 @@ def is_monotonic(ndarray[algos_t] arr, bint timelike): elif n < 2: return True, True, True - if timelike and arr[0] == iNaT: + if timelike and arr[0] == NPY_NAT: return False, False, True if algos_t is not object: @@ -811,7 +826,7 @@ def is_monotonic(ndarray[algos_t] arr, bint timelike): prev = arr[0] for i in range(1, n): cur = arr[i] - if timelike and cur == iNaT: + if timelike and cur == NPY_NAT: 
is_monotonic_inc = 0 is_monotonic_dec = 0 break @@ -836,7 +851,7 @@ def is_monotonic(ndarray[algos_t] arr, bint timelike): prev = arr[0] for i in range(1, n): cur = arr[i] - if timelike and cur == iNaT: + if timelike and cur == NPY_NAT: is_monotonic_inc = 0 is_monotonic_dec = 0 break @@ -866,7 +881,12 @@ is_monotonic_float32 = is_monotonic["float32_t"] is_monotonic_object = is_monotonic["object"] is_monotonic_int64 = is_monotonic["int64_t"] is_monotonic_int32 = is_monotonic["int32_t"] +is_monotonic_int16 = is_monotonic["int16_t"] +is_monotonic_int8 = is_monotonic["int8_t"] is_monotonic_uint64 = is_monotonic["uint64_t"] +is_monotonic_uint32 = is_monotonic["uint32_t"] +is_monotonic_uint16 = is_monotonic["uint16_t"] +is_monotonic_uint8 = is_monotonic["uint8_t"] is_monotonic_bool = is_monotonic["uint8_t"] diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 9f531f36d1a64..3708deb1a4b76 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -1,53 +1,40 @@ """ Template for each `dtype` helper function using 1-d template -# 1-d template -- pad -- pad_1d -- pad_2d -- backfill -- backfill_1d -- backfill_2d -- is_monotonic -- arrmap - WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ {{py: -# name, c_type, dest_type, dest_dtype -dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'), - ('float32', 'float32_t', 'float32_t', 'np.float32'), - ('int8', 'int8_t', 'float32_t', 'np.float32'), - ('int16', 'int16_t', 'float32_t', 'np.float32'), - ('int32', 'int32_t', 'float64_t', 'np.float64'), - ('int64', 'int64_t', 'float64_t', 'np.float64')] +# name, c_type, dest_type +dtypes = [('float64', 'float64_t', 'float64_t'), + ('float32', 'float32_t', 'float32_t'), + ('int8', 'int8_t', 'float32_t'), + ('int16', 'int16_t', 'float32_t'), + ('int32', 'int32_t', 'float64_t'), + ('int64', 'int64_t', 'float64_t')] def get_dispatch(dtypes): - for name, c_type, dest_type, dest_dtype, in dtypes: - - dest_type2 = dest_type - dest_type = dest_type.replace('_t', '') - - yield name, c_type, dest_type, dest_type2, dest_dtype + for name, c_type, dest_type, in dtypes: + dest_name = dest_type[:-2] # i.e. 
strip "_t" + yield name, c_type, dest_type, dest_name }} -{{for name, c_type, dest_type, dest_type2, dest_dtype +{{for name, c_type, dest_type, dest_name in get_dispatch(dtypes)}} @cython.boundscheck(False) @cython.wraparound(False) def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, - ndarray[{{dest_type2}}, ndim=2] out, + ndarray[{{dest_type}}, ndim=2] out, Py_ssize_t periods, int axis): cdef: Py_ssize_t i, j, sx, sy - sx, sy = ( arr).shape + sx, sy = (arr).shape if arr.flags.f_contiguous: if axis == 0: if periods >= 0: @@ -84,9 +71,9 @@ def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, out[i, j] = arr[i, j] - arr[i, j - periods] -def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values, +def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values, ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[{{dest_type2}}] out): + ndarray[{{dest_type}}] out): cdef: Py_ssize_t i, j, k @@ -97,18 +84,18 @@ def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values, {{endfor}} -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # ensure_dtype -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.intp)).descr.type_num +cdef int PLATFORM_INT = (np.arange(0, dtype=np.intp)).descr.type_num def ensure_platform_int(object arr): # GH3033, GH1392 # platform int is the size of the int pointer, e.g. np.intp if util.is_array(arr): - if ( arr).descr.type_num == PLATFORM_INT: + if (arr).descr.type_num == PLATFORM_INT: return arr else: return arr.astype(np.intp) @@ -118,7 +105,7 @@ def ensure_platform_int(object arr): def ensure_object(object arr): if util.is_array(arr): - if ( arr).descr.type_num == NPY_OBJECT: + if (arr).descr.type_num == NPY_OBJECT: return arr else: return arr.astype(np.object_) @@ -136,6 +123,9 @@ dtypes = [('float64', 'FLOAT64', 'float64'), ('int16', 'INT16', 'int16'), ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), + ('uint8', 'UINT8', 'uint8'), + ('uint16', 'UINT16', 'uint16'), + ('uint32', 'UINT32', 'uint32'), ('uint64', 'UINT64', 'uint64'), # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), @@ -149,9 +139,10 @@ def get_dispatch(dtypes): {{for name, c_type, dtype in get_dispatch(dtypes)}} + def ensure_{{name}}(object arr, copy=True): if util.is_array(arr): - if ( arr).descr.type_num == NPY_{{c_type}}: + if (arr).descr.type_num == NPY_{{c_type}}: return arr else: return arr.astype(np.{{dtype}}, copy=copy) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 130276ae0e73c..5dac94394c7ed 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -4,9 +4,9 @@ Template for each `dtype` helper function for rank WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # rank_1d, rank_2d -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -53,7 +53,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', int tiebreak = 0 bint keep_na = 0 bint isnan - float count = 0.0 + float64_t count = 0.0 tiebreak = 
tiebreakers[ties_method] {{if dtype == 'float64'}} @@ -74,9 +74,9 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', {{elif dtype == 'float64'}} mask = np.isnan(values) {{elif dtype == 'int64'}} - mask = values == iNaT + mask = values == NPY_NAT - # create copy in case of iNaT + # create copy in case of NPY_NAT # values are mutated inplace if mask.any(): values = values.copy() @@ -102,15 +102,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ranks = np.empty(n, dtype='f8') {{if dtype == 'object'}} - - try: - _as = np.lexsort(keys=order) - except TypeError: - # lexsort on object array will raise TypeError for numpy version - # earlier than 1.11.0. Use argsort with order argument instead. - _dt = [('values', 'O'), ('mask', '?')] - _values = np.asarray(list(zip(order[0], order[1])), dtype=_dt) - _as = np.argsort(_values, kind='mergesort', order=('mask', 'values')) + _as = np.lexsort(keys=order) {{else}} if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here @@ -126,62 +118,43 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', sorted_data = values.take(_as) sorted_mask = mask.take(_as) - _indices = np.diff(sorted_mask).nonzero()[0] + _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] non_na_idx = _indices[0] if len(_indices) > 0 else -1 argsorted = _as.astype('i8') {{if dtype == 'object'}} - for i in range(n): - sum_ranks += i + 1 - dups += 1 - isnan = sorted_mask[i] - val = util.get_value_at(sorted_data, i) - - if isnan and keep_na: - ranks[argsorted[i]] = nan - continue - count += 1.0 - - if (i == n - 1 or - are_diff(util.get_value_at(sorted_data, i + 1), val) or - i == non_na_idx): - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - raise ValueError('first not supported for non-numeric data') - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 + if True: {{else}} with nogil: + {{endif}} + # TODO: why does the 2d version not have a nogil block? 
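The object-dtype branch of ``rank_1d`` above now calls ``np.lexsort`` directly; the removed try/except was a workaround for numpy versions earlier than 1.11, where lexsort raised TypeError on object keys. A small standalone illustration of the call pattern (not pandas internals): lexsort sorts by its last key first, so a missing-value mask passed as the final key pushes masked entries to the end while ties are broken by the values themselves.

    import numpy as np

    values = np.array(["pear", "apple", "apple", "banana"], dtype=object)
    mask = np.array([False, False, True, False])   # pretend the third entry is missing

    # mask is the primary key (last in the tuple), values the secondary key
    order = np.lexsort(keys=(values, mask))
    print(order)   # -> [1 3 0 2], masked entry sorted last
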
for i in range(n): sum_ranks += i + 1 dups += 1 + + {{if dtype == 'object'}} + val = util.get_value_at(sorted_data, i) + {{else}} val = sorted_data[i] + {{endif}} {{if dtype != 'uint64'}} isnan = sorted_mask[i] if isnan and keep_na: - ranks[argsorted[i]] = nan + ranks[argsorted[i]] = NaN continue {{endif}} count += 1.0 - if (i == n - 1 or sorted_data[i + 1] != val or - i == non_na_idx): + {{if dtype == 'object'}} + if (i == n - 1 or + are_diff(util.get_value_at(sorted_data, i + 1), val) or + i == non_na_idx): + {{else}} + if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx): + {{endif}} + if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups @@ -192,8 +165,13 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: + {{if dtype == 'object'}} + raise ValueError('first not supported for ' + 'non-numeric data') + {{else}} for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = j + 1 + {{endif}} elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 @@ -202,7 +180,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 - {{endif}} if pct: if tiebreak == TIEBREAK_DENSE: return ranks / total_tie_count @@ -243,7 +220,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 - float count = 0.0 + float64_t count = 0.0 tiebreak = tiebreakers[ties_method] @@ -272,13 +249,13 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', {{elif dtype == 'float64'}} mask = np.isnan(values) {{elif dtype == 'int64'}} - mask = values == iNaT + mask = values == NPY_NAT {{endif}} np.putmask(values, mask, nan_value) {{endif}} - n, k = ( values).shape + n, k = (values).shape ranks = np.empty((n, k), dtype='f8') {{if dtype == 'object'}} @@ -332,7 +309,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', {{else}} if (val == nan_value) and keep_na: {{endif}} - ranks[i, argsorted[i, j]] = nan + ranks[i, argsorted[i, j]] = NaN {{if dtype == 'object'}} infs += 1 diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 358479c837d05..2fea8b17fd9d7 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -4,9 +4,9 @@ Template for each `dtype` helper function for take WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # take_1d, take_2d -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -278,7 +278,7 @@ cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): ndarray[take_t, ndim=2] result object val - N, K = ( values).shape + N, K = (values).shape if take_t is object: # evaluated at compile-time diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d683c93c9b32e..7c16b29f3e42b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1,14 +1,13 @@ # -*- coding: utf-8 -*- -cimport cython -from cython cimport Py_ssize_t +import cython +from cython import Py_ssize_t from libc.stdlib cimport 
malloc, free import numpy as np cimport numpy as cnp from numpy cimport (ndarray, - double_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float32_t, float64_t) cnp.import_array() @@ -20,10 +19,9 @@ from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE) from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers -cdef int64_t iNaT = get_nat() +cdef int64_t NPY_NAT = get_nat() -cdef double NaN = np.NaN -cdef double nan = NaN +cdef float64_t NaN = np.NaN cdef inline float64_t median_linear(float64_t* a, int n) nogil: @@ -44,7 +42,7 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil: if na_count == n: return NaN - tmp = malloc((n - na_count) * sizeof(float64_t)) + tmp = malloc((n - na_count) * sizeof(float64_t)) j = 0 for i in range(n): @@ -67,13 +65,13 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil: return result -# TODO: Is this redundant with algos.kth_smallest? +# TODO: Is this redundant with algos.kth_smallest cdef inline float64_t kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n) nogil: cdef: Py_ssize_t i, j, l, m - double_t x, t + float64_t x, t l = 0 m = n - 1 @@ -109,19 +107,19 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, cdef: Py_ssize_t i, j, N, K, ngroups, size ndarray[int64_t] _counts - ndarray data + ndarray[float64_t, ndim=2] data float64_t* ptr assert min_count == -1, "'min_count' only used in add and prod" ngroups = len(counts) - N, K = ( values).shape + N, K = (values).shape indexer, _counts = groupsort_indexer(labels, ngroups) counts[:] = _counts[1:] data = np.empty((K, N), dtype=np.float64) - ptr = cnp.PyArray_DATA(data) + ptr = cnp.PyArray_DATA(data) take_2d_axis1_float64_float64(values.T, indexer, out=data) @@ -139,8 +137,8 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) def group_cumprod_float64(float64_t[:, :] out, - float64_t[:, :] values, - int64_t[:] labels, + const float64_t[:, :] values, + const int64_t[:] labels, bint is_datetimelike, bint skipna=True): """ @@ -152,7 +150,7 @@ def group_cumprod_float64(float64_t[:, :] out, float64_t[:, :] accum int64_t lab - N, K = ( values).shape + N, K = (values).shape accum = np.ones_like(values) with nogil: @@ -177,7 +175,7 @@ def group_cumprod_float64(float64_t[:, :] out, @cython.wraparound(False) def group_cumsum(numeric[:, :] out, numeric[:, :] values, - int64_t[:] labels, + const int64_t[:] labels, is_datetimelike, bint skipna=True): """ @@ -189,7 +187,7 @@ def group_cumsum(numeric[:, :] out, numeric[:, :] accum int64_t lab - N, K = ( values).shape + N, K = (values).shape accum = np.zeros_like(values) with nogil: @@ -217,7 +215,7 @@ def group_cumsum(numeric[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, +def group_shift_indexer(int64_t[:] out, const int64_t[:] labels, int ngroups, int periods): cdef: Py_ssize_t N, i, j, ii @@ -226,7 +224,7 @@ def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) int64_t[:, :] label_indexer - N, = ( labels).shape + N, = (labels).shape if periods < 0: periods = -periods @@ -291,7 +289,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, """ cdef: Py_ssize_t i, N - ndarray[int64_t] sorted_labels + int64_t[:] sorted_labels int64_t idx, curr_fill_idx=-1, filled_vals=0 N = 
len(out) @@ -327,10 +325,10 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.boundscheck(False) @cython.wraparound(False) -def group_any_all(ndarray[uint8_t] out, - ndarray[int64_t] labels, - ndarray[uint8_t] values, - ndarray[uint8_t] mask, +def group_any_all(uint8_t[:] out, + const int64_t[:] labels, + const uint8_t[:] values, + const uint8_t[:] mask, object val_test, bint skipna): """Aggregated boolean values to show truthfulness of group elements @@ -353,7 +351,7 @@ def group_any_all(ndarray[uint8_t] out, The returned values will either be 0 or 1 (False or True, respectively). """ cdef: - Py_ssize_t i, N=len(labels) + Py_ssize_t i, N = len(labels) int64_t lab uint8_t flag_val @@ -370,7 +368,7 @@ def group_any_all(ndarray[uint8_t] out, else: raise ValueError("'bool_func' must be either 'any' or 'all'!") - out.fill(1 - flag_val) + out[:] = 1 - flag_val with nogil: for i in range(N): diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 765381d89705d..abac9f147848e 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -5,35 +5,31 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ cdef extern from "numpy/npy_math.h": - double NAN "NPY_NAN" + float64_t NAN "NPY_NAN" _int64_max = np.iinfo(np.int64).max -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # group_add, group_prod, group_var, group_mean, group_ohlc -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: -# name, c_type, dest_type, dest_dtype -dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'), - ('float32', 'float32_t', 'float32_t', 'np.float32')] +# name, c_type +dtypes = [('float64', 'float64_t'), + ('float32', 'float32_t')] def get_dispatch(dtypes): - for name, c_type, dest_type, dest_dtype in dtypes: - - dest_type2 = dest_type - dest_type = dest_type.replace('_t', '') - - yield name, c_type, dest_type, dest_type2, dest_dtype + for name, c_type in dtypes: + yield name, c_type }} -{{for name, c_type, dest_type, dest_type2, dest_dtype in get_dispatch(dtypes)}} +{{for name, c_type in get_dispatch(dtypes)}} @cython.wraparound(False) @cython.boundscheck(False) -def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, +def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, @@ -43,8 +39,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] sumx, nobs + {{c_type}} val, count + ndarray[{{c_type}}, ndim=2] sumx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -52,7 +48,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, nobs = np.zeros_like(out) sumx = np.zeros_like(out) - N, K = ( values).shape + N, K = (values).shape with nogil: @@ -80,7 +76,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, +def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, @@ -90,8 +86,8 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, 
ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] prodx, nobs + {{c_type}} val, count + ndarray[{{c_type}}, ndim=2] prodx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -99,7 +95,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, nobs = np.zeros_like(out) prodx = np.ones_like(out) - N, K = ( values).shape + N, K = (values).shape with nogil: for i in range(N): @@ -127,15 +123,15 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, +def group_var_{{name}}(ndarray[{{c_type}}, ndim=2] out, ndarray[int64_t] counts, - ndarray[{{dest_type2}}, ndim=2] values, + ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, ct, oldmean - ndarray[{{dest_type2}}, ndim=2] nobs, mean + {{c_type}} val, ct, oldmean + ndarray[{{c_type}}, ndim=2] nobs, mean assert min_count == -1, "'min_count' only used in add and prod" @@ -145,7 +141,7 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, nobs = np.zeros_like(out) mean = np.zeros_like(out) - N, K = ( values).shape + N, K = (values).shape out[:, :] = 0.0 @@ -179,15 +175,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, +def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out, ndarray[int64_t] counts, - ndarray[{{dest_type2}}, ndim=2] values, + ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] sumx, nobs + {{c_type}} val, count + ndarray[{{c_type}}, ndim=2] sumx, nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -197,7 +193,7 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, nobs = np.zeros_like(out) sumx = np.zeros_like(out) - N, K = ( values).shape + N, K = (values).shape with nogil: for i in range(N): @@ -224,9 +220,9 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, +def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out, ndarray[int64_t] counts, - ndarray[{{dest_type2}}, ndim=2] values, + ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, Py_ssize_t min_count=-1): """ @@ -234,7 +230,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab - {{dest_type2}} val, count + {{c_type}} val, count Py_ssize_t ngroups = len(counts) assert min_count == -1, "'min_count' only used in add and prod" @@ -242,7 +238,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, if len(labels) == 0: return - N, K = ( values).shape + N, K = (values).shape if out.shape[1] != 4: raise ValueError('Output array must have 4 columns') @@ -250,7 +246,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, if K > 1: raise NotImplementedError("Argument 'values' must have only " "one dimension") - out.fill(np.nan) + out[:] = np.nan with nogil: for i in range(N): @@ -272,32 +268,32 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{endfor}} 
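Several hunks in this diff replace ``arr.fill(x)`` with the broadcasted assignment ``arr[:] = x`` (group_ohlc just above, group_any_all, and the indexer setup in algos.pyx). The two spellings are equivalent for ndarrays; the slice form has the extra property of also being valid on Cython typed memoryviews, which several of these arguments are being converted to. A quick check of the equivalence in plain numpy:

    import numpy as np

    out = np.empty((3, 4), dtype=np.float64)

    out.fill(np.nan)      # ndarray-only spelling
    out[:] = np.nan       # equivalent broadcasted assignment; this form also
                          # works on typed memoryviews such as float64_t[:, :]
    assert np.isnan(out).all()
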
-#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # group_nth, group_last, group_rank -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: -# name, c_type, dest_type2, nan_val -dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'), - ('float32', 'float32_t', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'int64_t', 'iNaT'), - ('object', 'object', 'object', 'NAN')] +# name, c_type, nan_val +dtypes = [('float64', 'float64_t', 'NAN'), + ('float32', 'float32_t', 'NAN'), + ('int64', 'int64_t', 'NPY_NAT'), + ('object', 'object', 'NAN')] def get_dispatch(dtypes): - for name, c_type, dest_type2, nan_val in dtypes: + for name, c_type, nan_val in dtypes: - yield name, c_type, dest_type2, nan_val + yield name, c_type, nan_val }} -{{for name, c_type, dest_type2, nan_val in get_dispatch(dtypes)}} +{{for name, c_type, nan_val in get_dispatch(dtypes)}} @cython.wraparound(False) @cython.boundscheck(False) -def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, +def group_last_{{name}}(ndarray[{{c_type}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, @@ -307,8 +303,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val - ndarray[{{dest_type2}}, ndim=2] resx + {{c_type}} val + ndarray[{{c_type}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -316,14 +312,14 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") - nobs = np.zeros(( out).shape, dtype=np.int64) - {{if name=='object'}} - resx = np.empty(( out).shape, dtype=object) + nobs = np.zeros((out).shape, dtype=np.int64) + {{if name == 'object'}} + resx = np.empty((out).shape, dtype=object) {{else}} resx = np.empty_like(out) {{endif}} - N, K = ( values).shape + N, K = (values).shape {{if name == "object"}} if True: # make templating happy @@ -351,9 +347,10 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.wraparound(False) @cython.boundscheck(False) -def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, +def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, int64_t rank, @@ -363,8 +360,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val - ndarray[{{dest_type2}}, ndim=2] resx + {{c_type}} val + ndarray[{{c_type}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -372,14 +369,14 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") - nobs = np.zeros(( out).shape, dtype=np.int64) + nobs = np.zeros((out).shape, dtype=np.int64) {{if name=='object'}} - resx = np.empty(( out).shape, dtype=object) + resx = np.empty((out).shape, dtype=object) {{else}} resx = np.empty_like(out) {{endif}} - N, K = ( values).shape + N, K = (values).shape {{if name == "object"}} if True: # make templating happy @@ -410,6 +407,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{if name != 
'object'}} + + @cython.boundscheck(False) @cython.wraparound(False) def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, @@ -463,14 +462,14 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' - N, K = ( values).shape + N, K = (values).shape grp_sizes = np.ones_like(out) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array masked_vals = np.array(values[:, 0], copy=True) - {{if name=='int64'}} + {{if name == 'int64'}} mask = (masked_vals == {{nan_val}}).astype(np.uint8) {{else}} mask = np.isnan(masked_vals).astype(np.uint8) @@ -528,7 +527,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # to the result where appropriate if keep_na and mask[_as[i]]: for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = nan + out[_as[j], 0] = NaN grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): @@ -588,47 +587,37 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # rankings, so we assign them percentages of NaN. if out[i, 0] != out[i, 0] or out[i, 0] == NAN: out[i, 0] = NAN - else: + elif grp_sizes[i, 0] != 0: out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} {{endfor}} -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # group_min, group_max -#---------------------------------------------------------------------- - -{{py: +# ---------------------------------------------------------------------- -# name, c_type, dest_type2, nan_val -dtypes = [('float64', 'float64_t', 'NAN', 'np.inf'), - ('float32', 'float32_t', 'NAN', 'np.inf'), - ('int64', 'int64_t', 'iNaT', '_int64_max')] - -def get_dispatch(dtypes): - - for name, dest_type2, nan_val, inf_val in dtypes: - yield name, dest_type2, nan_val, inf_val -}} - - -{{for name, dest_type2, nan_val, inf_val in get_dispatch(dtypes)}} +# TODO: consider implementing for more dtypes +ctypedef fused groupby_t: + float64_t + float32_t + int64_t @cython.wraparound(False) @cython.boundscheck(False) -def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - ndarray[int64_t] counts, - ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): +def group_max(ndarray[groupby_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[groupby_t, ndim=2] values, + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] maxx, nobs + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] maxx, nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -638,9 +627,15 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, nobs = np.zeros_like(out) maxx = np.empty_like(out) - maxx.fill(-{{inf_val}}) + if groupby_t is int64_t: + # Note: evaluated at compile-time + maxx[:] = -_int64_max + nan_val = NPY_NAT + else: + maxx[:] = -np.inf + nan_val = NAN - N, K = ( values).shape + N, K = (values).shape with nogil: for i in range(N): @@ -653,37 +648,39 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val + if groupby_t is int64_t: + if val != nan_val: + nobs[lab, j] 
+= 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + if val == val and val != nan_val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = {{nan_val}} + out[i, j] = nan_val else: out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - ndarray[int64_t] counts, - ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): +def group_min(ndarray[groupby_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[groupby_t, ndim=2] values, + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] minx, nobs + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] minx, nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -693,9 +690,14 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, nobs = np.zeros_like(out) minx = np.empty_like(out) - minx.fill({{inf_val}}) + if groupby_t is int64_t: + minx[:] = _int64_max + nan_val = NPY_NAT + else: + minx[:] = np.inf + nan_val = NAN - N, K = ( values).shape + N, K = (values).shape with nogil: for i in range(N): @@ -708,41 +710,46 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val + if groupby_t is int64_t: + if val != nan_val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + if val == val and val != nan_val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = {{nan_val}} + out[i, j] = nan_val else: out[i, j] = minx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels, - bint is_datetimelike): +def group_cummin(ndarray[groupby_t, ndim=2] out, + ndarray[groupby_t, ndim=2] values, + ndarray[int64_t] labels, + bint is_datetimelike): """ Only transforms on axis=0 """ cdef: Py_ssize_t i, j, N, K, size - {{dest_type2}} val, mval - ndarray[{{dest_type2}}, ndim=2] accum + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum int64_t lab - N, K = ( values).shape + N, K = (values).shape accum = np.empty_like(values) - accum.fill({{inf_val}}) + if groupby_t is int64_t: + accum[:] = _int64_max + else: + accum[:] = np.inf with nogil: for i in range(N): @@ -754,37 +761,43 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # val = nan - {{if name == 'int64'}} - if is_datetimelike and val == {{nan_val}}: - out[i, j] = {{nan_val}} + if groupby_t is int64_t: + if is_datetimelike and val == NPY_NAT: + out[i, j] = NPY_NAT + else: + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval else: - {{else}} - if val == val: - {{endif}} - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + if val == val: + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval @cython.boundscheck(False) @cython.wraparound(False) -def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - 
ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels, - bint is_datetimelike): +def group_cummax(ndarray[groupby_t, ndim=2] out, + ndarray[groupby_t, ndim=2] values, + ndarray[int64_t] labels, + bint is_datetimelike): """ Only transforms on axis=0 """ cdef: Py_ssize_t i, j, N, K, size - {{dest_type2}} val, mval - ndarray[{{dest_type2}}, ndim=2] accum + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum int64_t lab - N, K = ( values).shape + N, K = (values).shape accum = np.empty_like(values) - accum.fill(-{{inf_val}}) + if groupby_t is int64_t: + accum[:] = -_int64_max + else: + accum[:] = -np.inf with nogil: for i in range(N): @@ -795,16 +808,17 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for j in range(K): val = values[i, j] - {{if name == 'int64'}} - if is_datetimelike and val == {{nan_val}}: - out[i, j] = {{nan_val}} + if groupby_t is int64_t: + if is_datetimelike and val == NPY_NAT: + out[i, j] = NPY_NAT + else: + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval else: - {{else}} - if val == val: - {{endif}} - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval - -{{endfor}} + if val == val: + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index c2305c8f3ff00..6e66693decc01 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -54,8 +54,8 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): n = len(arr) # create an array of bytes - vecs = malloc(n * sizeof(char *)) - lens = malloc(n * sizeof(uint64_t)) + vecs = malloc(n * sizeof(char *)) + lens = malloc(n * sizeof(uint64_t)) for i in range(n): val = arr[i] diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 2ced98198afc6..9aa887727a765 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -2,20 +2,18 @@ cimport cython -from cpython cimport (PyObject, Py_INCREF, PyList_Check, PyTuple_Check, - PyMem_Malloc, PyMem_Realloc, PyMem_Free, - PyString_Check, PyBytes_Check, - PyUnicode_Check) +from cpython cimport (PyObject, Py_INCREF, + PyMem_Malloc, PyMem_Realloc, PyMem_Free) from libc.stdlib cimport malloc, free import numpy as np cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint32_t +from numpy cimport ndarray, uint8_t, uint32_t, float64_t cnp.import_array() cdef extern from "numpy/npy_math.h": - double NAN "NPY_NAN" + float64_t NAN "NPY_NAN" from khash cimport ( @@ -44,9 +42,7 @@ cimport util from missing cimport checknull -nan = np.nan - -cdef int64_t iNaT = util.get_nat() +cdef int64_t NPY_NAT = util.get_nat() _SIZE_HINT_LIMIT = (1 << 20) + 7 @@ -153,7 +149,7 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels): cdef: int ret = 0 Py_ssize_t i, n = len(labels) - kh_int64_t * table = kh_init_int64() + kh_int64_t *table = kh_init_int64() Int64Vector idx = Int64Vector() ndarray[int64_t, ndim=1] arr Int64VectorData *ud = idx.data diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f294fd141a9f1..7f4c2a6410870 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,9 +5,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # VectorData 
-#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -53,9 +53,9 @@ ctypedef fused vector_data: cdef inline bint needs_resize(vector_data *data) nogil: return data.n == data.m -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Vector -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -86,12 +86,12 @@ cdef class {{name}}Vector: self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype={{idtype}}) - self.data.data = <{{arg}}*> self.ao.data + self.data.data = <{{arg}}*>self.ao.data cdef resize(self): self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) self.ao.resize(self.data.m, refcheck=False) - self.data.data = <{{arg}}*> self.ao.data + self.data.data = <{{arg}}*>self.ao.data def __dealloc__(self): if self.data is not NULL: @@ -134,14 +134,13 @@ cdef class StringVector: bint external_view_exists def __cinit__(self): - self.data = PyMem_Malloc( - sizeof(StringVectorData)) + self.data = PyMem_Malloc(sizeof(StringVectorData)) if not self.data: raise MemoryError() self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP - self.data.data = malloc(self.data.m * sizeof(char *)) + self.data.data = malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() @@ -154,7 +153,7 @@ cdef class StringVector: self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) orig_data = self.data.data - self.data.data = malloc(self.data.m * sizeof(char *)) + self.data.data = malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() for i in range(m): @@ -184,7 +183,7 @@ cdef class StringVector: self.data.m = self.data.n return ao - cdef inline void append(self, char * x): + cdef inline void append(self, char *x): if needs_resize(self.data): self.resize() @@ -209,22 +208,22 @@ cdef class ObjectVector: self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) - self.data = self.ao.data + self.data = self.ao.data def __len__(self): return self.n - cdef inline append(self, object o): + cdef inline append(self, object obj): if self.n == self.m: if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.m = max(self.m * 2, _INIT_VEC_CAP) self.ao.resize(self.m, refcheck=False) - self.data = self.ao.data + self.data = self.ao.data - Py_INCREF(o) - self.data[self.n] = o + Py_INCREF(obj) + self.data[self.n] = obj self.n += 1 def to_array(self): @@ -240,9 +239,9 @@ cdef class ObjectVector: for i in range(len(x)): self.append(x[i]) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # HashTable -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- cdef class HashTable: @@ -252,9 +251,9 @@ cdef class HashTable: {{py: # name, dtype, float_group, default_na_value -dtypes = [('Float64', 'float64', True, 'nan'), +dtypes = [('Float64', 'float64', True, 'np.nan'), ('UInt64', 'uint64', False, 0), - ('Int64', 'int64', False, 'iNaT')] + ('Int64', 'int64', False, 'NPY_NAT')] }} @@ -283,9 +282,9 @@ cdef class {{name}}HashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ 
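The ``sizeof()`` accounting that follows charges each khash bucket for its key, its ``Py_ssize_t`` value slot, and a ``uint32`` flag word. A back-of-the-envelope figure for an Int64HashTable under that formula, assuming a 64-bit platform (so ``Py_ssize_t`` is 8 bytes):

    n_buckets = 1 << 20
    bytes_per_bucket = 8 + 8 + 4          # int64 key + Py_ssize_t val + uint32 flags
    print(n_buckets * bytes_per_bucket)   # ~20 MiB for a million-bucket table
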
- return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys + sizeof(Py_ssize_t) + # vals + sizeof(uint32_t)) # flags cpdef get_item(self, {{dtype}}_t val): cdef khiter_t k @@ -319,7 +318,7 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): key = keys[i] k = kh_put_{{dtype}}(self.table, key, &ret) - self.table.vals[k] = values[i] + self.table.vals[k] = values[i] @cython.boundscheck(False) def map_locations(self, ndarray[{{dtype}}_t, ndim=1] values): @@ -355,26 +354,56 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) - def factorize(self, {{dtype}}_t values): - uniques = {{name}}Vector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels - @cython.boundscheck(False) - def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value=None): + @cython.wraparound(False) + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + uniques : {{name}}Vector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse=True) + The labels from values to uniques + """ cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels - Py_ssize_t idx, count = count_prior int ret = 0 {{dtype}}_t val, na_value2 khiter_t k {{name}}VectorData *ud bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -392,18 +421,19 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if val != val or (use_na_value and val == na_value2): + if ignore_na and (val != val + or (use_na_value and val == na_value2)): + # if missing values do not count as unique values (i.e. 
if + # ignore_na is True), skip the hashtable entry for them, + # and replace the corresponding label with na_sentinel labels[i] = na_sentinel continue k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: + # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - self.table.vals[k] = count if needs_resize(ud): with gil: @@ -413,10 +443,82 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - labels[i] = count - count += 1 + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx - return np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = {{name}}Vector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, + object na_value=None): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. 
+ + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = {{name}}Vector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) + + def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -463,30 +565,6 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques - @cython.boundscheck(False) - def unique(self, const {{dtype}}_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - return uniques.to_array() - {{endfor}} @@ -543,13 +621,13 @@ cdef class StringHashTable(HashTable): cdef: Py_ssize_t i, n = len(values) ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - int64_t *resbuf = labels.data + int64_t *resbuf = labels.data khiter_t k kh_str_t *table = self.table const char *v const char **vecs - vecs = malloc(n * sizeof(char *)) + vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] v = util.get_c_string(val) @@ -566,47 +644,6 @@ cdef class StringHashTable(HashTable): free(vecs) return labels - @cython.boundscheck(False) - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, count, n = len(values) - int64_t[:] uindexer - int ret = 0 - object val - ObjectVector uniques - khiter_t k - const char *v - const char **vecs - - vecs = malloc(n * sizeof(char *)) - uindexer = np.empty(n, dtype=np.int64) - for i in range(n): - val = values[i] - v = util.get_c_string(val) - vecs[i] = v - - count = 0 - with nogil: - for i in range(n): - v = vecs[i] - k = kh_get_str(self.table, v) - if k == self.table.n_buckets: - kh_put_str(self.table, v, &ret) - uindexer[count] = i - count += 1 - free(vecs) - - # uniques - uniques = ObjectVector() - for i in range(count): - uniques.append(values[uindexer[i]]) - return uniques.to_array() - - def factorize(self, ndarray[object] values): - uniques = ObjectVector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels - @cython.boundscheck(False) def lookup(self, ndarray[object] values): cdef: @@ -618,11 +655,11 @@ cdef class StringHashTable(HashTable): int64_t[:] locs = np.empty(n, dtype=np.int64) # these by-definition *must* be strings - vecs = malloc(n * sizeof(char *)) + vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if PyUnicode_Check(val) or PyString_Check(val): + if isinstance(val, (str, unicode)): v = util.get_c_string(val) else: v = util.get_c_string(self.na_string_sentinel) @@ -651,11 +688,11 @@ cdef class StringHashTable(HashTable): khiter_t k # these by-definition *must* be strings - vecs = malloc(n * sizeof(char *)) + vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if PyUnicode_Check(val) or 
PyString_Check(val): + if isinstance(val, (str, unicode)): v = util.get_c_string(val) else: v = util.get_c_string(self.na_string_sentinel) @@ -669,14 +706,48 @@ cdef class StringHashTable(HashTable): free(vecs) @cython.boundscheck(False) - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - object na_value=None): + @cython.wraparound(False) + def _unique(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + uniques : ObjectVector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse=True) + The labels from values to uniques + """ cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels int64_t[:] uindexer - Py_ssize_t idx, count = count_prior int ret = 0 object val const char *v @@ -684,41 +755,50 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - # these by-definition *must* be strings - labels = np.zeros(n, dtype=np.int64) + if return_inverse: + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None - # pre-filter out missing - # and assign pointers - vecs = malloc(n * sizeof(char *)) + # assign pointers and pre-filter out missing (if ignore_na) + vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if ((PyUnicode_Check(val) or PyString_Check(val)) and - not (use_na_value and val == na_value)): + if (ignore_na + and (not isinstance(val, (str, unicode)) + or (use_na_value and val == na_value))): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), we can skip the actual value, and + # replace the label with na_sentinel directly + labels[i] = na_sentinel + else: + # if ignore_na is False, we also stringify NaN/None/etc. 
v = util.get_c_string(val) vecs[i] = v - else: - labels[i] = na_sentinel # compute with nogil: for i in range(n): - if labels[i] == na_sentinel: + if ignore_na and labels[i] == na_sentinel: + # skip entries for ignored missing values (see above) continue v = vecs[i] k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: + # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) - self.table.vals[k] = count uindexer[count] = i - labels[i] = count + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx free(vecs) @@ -726,7 +806,72 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - return np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, ndarray[object] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. 
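Because anything that is not a string counts as missing here, StringHashTable.factorize drops such entries from the uniques and marks them with na_sentinel in the labels. A hedged sketch of that behaviour (names as exposed by pandas._libs.hashtable; the example data and results are illustrative):

    import numpy as np
    from pandas._libs import hashtable as ht

    arr = np.array(["a", "b", "a", None], dtype=object)

    table = ht.StringHashTable(len(arr))
    uniques, labels = table.factorize(arr)
    # None is not a string, so it is skipped in the uniques and labelled -1:
    # expected: uniques == ['a', 'b'], labels == [0, 1, 0, -1]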
+ + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = ObjectVector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels cdef class PyObjectHashTable(HashTable): @@ -752,9 +897,9 @@ cdef class PyObjectHashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof(PyObject *) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + return self.table.n_buckets * (sizeof(PyObject *) + # keys + sizeof(Py_ssize_t) + # vals + sizeof(uint32_t)) # flags cpdef get_item(self, object val): cdef khiter_t k @@ -814,57 +959,147 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - hash(val) - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - - return uniques.to_array() - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - object na_value=None): + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + uniques : ObjectVector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. 
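The count_prior argument is what lets the get_labels wrapper kept further below factorize several arrays incrementally against one table and one uniques vector. A sketch of that pattern (PyObjectHashTable and ObjectVector are the names exported by pandas._libs.hashtable; the data and expected labels are illustrative):

    import numpy as np
    from pandas._libs import hashtable as ht

    table = ht.PyObjectHashTable()
    uniques = ht.ObjectVector()

    first = np.array(["a", "b"], dtype=object)
    second = np.array(["b", "c"], dtype=object)

    labels1 = table.get_labels(first, uniques)                   # expected [0, 1]
    # two uniques collected so far, so new ones start numbering at 2
    labels2 = table.get_labels(second, uniques, count_prior=2)   # expected [1, 2]
    # uniques.to_array() should now be ['a', 'b', 'c']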
+ + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse=True) + The labels from values to uniques + """ cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels - Py_ssize_t idx, count = count_prior int ret = 0 object val khiter_t k bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None for i in range(n): val = values[i] hash(val) - if ((val != val or val is None) or - (use_na_value and val == na_value)): + if ignore_na and ((val != val or val is None) + or (use_na_value and val == na_value)): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), skip the hashtable entry for them, and + # replace the corresponding label with na_sentinel labels[i] = na_sentinel continue k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: + # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count uniques.append(val) - labels[i] = count - count += 1 + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, ndarray[object] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. 
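Passing na_value widens what counts as missing on top of the always-excluded None and NaN. A hedged example of the effect on the object table (sample data and results are illustrative):

    import numpy as np
    from pandas._libs import hashtable as ht

    arr = np.array(["a", "sentinel", "b", None, "a"], dtype=object)

    table = ht.PyObjectHashTable(len(arr))
    uniques, labels = table.factorize(arr, na_value="sentinel")
    # None is missing by default and "sentinel" is added to the missing set:
    # expected: uniques == ['a', 'b'], labels == [0, -1, 1, -1, 0]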
+ + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = ObjectVector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) - return np.asarray(labels) + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 45a69b613f698..80d864c65d087 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -4,9 +4,9 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # VectorData -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -45,11 +45,11 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, val = values[i] if not checknull(val) or not dropna: - k = kh_get_{{ttype}}(table, val) + k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_{{ttype}}(table, val, &ret) + k = kh_put_{{ttype}}(table, val, &ret) table.vals[k] = 1 {{else}} with nogil: @@ -80,7 +80,7 @@ cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna): {{endif}} cdef: - Py_ssize_t i=0 + Py_ssize_t i = 0 kh_{{ttype}}_t *table {{if dtype != 'object'}} @@ -103,7 +103,7 @@ cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna): {{if dtype == 'object'}} for k in range(table.n_buckets): if kh_exist_{{ttype}}(table, k): - result_keys[i] = <{{dtype}}> table.keys[k] + result_keys[i] = <{{dtype}}>table.keys[k] result_counts[i] = table.vals[k] i += 1 {{else}} @@ -128,6 +128,7 @@ cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna): @cython.boundscheck(False) {{if dtype == 'object'}} + def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): {{else}} @@ -140,7 +141,7 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): {{dtype}}_t value {{endif}} Py_ssize_t k, i, n = len(values) - kh_{{ttype}}_t * table = kh_init_{{ttype}}() + kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') kh_resize_{{ttype}}(table, min(n, _SIZE_HINT_LIMIT)) @@ -151,7 +152,7 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): if keep == 'last': {{if dtype == 'object'}} for i from n > i >= 0: - kh_put_{{ttype}}(table, values[i], &ret) + kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{else}} with nogil: @@ -162,7 +163,7 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): elif keep == 'first': {{if dtype == 'object'}} for i in range(n): - kh_put_{{ttype}}(table, values[i], &ret) + kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{else}} with nogil: @@ -174,13 +175,13 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): {{if dtype == 
'object'}} for i in range(n): value = values[i] - k = kh_get_{{ttype}}(table, value) + k = kh_get_{{ttype}}(table, value) if k != table.n_buckets: out[table.vals[k]] = 1 out[i] = 1 else: - k = kh_put_{{ttype}}(table, value, &ret) - table.keys[k] = value + k = kh_put_{{ttype}}(table, value, &ret) + table.keys[k] = value table.vals[k] = i out[i] = 0 {{else}} @@ -201,18 +202,20 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): return out -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Membership -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} + def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values): {{else}} + def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): {{endif}} @@ -234,8 +237,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): int ret = 0 ndarray[uint8_t] result {{scalar}} val - kh_{{ttype}}_t * table = kh_init_{{ttype}}() - + kh_{{ttype}}_t *table = kh_init_{{ttype}}() # construct the table n = len(values) @@ -243,7 +245,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): {{if dtype == 'object'}} for i in range(n): - kh_put_{{ttype}}(table, values[i], &ret) + kh_put_{{ttype}}(table, values[i], &ret) {{else}} with nogil: for i in range(n): @@ -257,7 +259,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): {{if dtype == 'object'}} for i in range(n): val = arr[i] - k = kh_get_{{ttype}}(table, val) + k = kh_get_{{ttype}}(table, val) result[i] = (k != table.n_buckets) {{else}} with nogil: @@ -273,9 +275,9 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): {{endfor}} -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Mode Computations -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -303,17 +305,13 @@ def mode_{{dtype}}({{ctype}}[:] values, bint dropna): {{endif}} cdef: int count, max_count = 1 - int j = -1 # so you can do += + int j = -1 # so you can do += Py_ssize_t k kh_{{table_type}}_t *table ndarray[{{ctype}}] modes table = kh_init_{{table_type}}() - {{if dtype == 'object'}} - build_count_table_{{dtype}}(values, table, dropna) - {{else}} build_count_table_{{dtype}}(values, table, dropna) - {{endif}} modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}}) @@ -344,7 +342,7 @@ def mode_{{dtype}}({{ctype}}[:] values, bint dropna): else: continue - modes[j] = table.keys[k] + modes[j] = table.keys[k] {{endif}} kh_destroy_{{table_type}}(table) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 3f76915655f58..d828c3dd8e923 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -5,8 +5,10 @@ import cython import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, float64_t, int32_t, - int64_t, uint8_t, uint64_t, intp_t, +from numpy cimport (ndarray, intp_t, + float64_t, float32_t, + int64_t, int32_t, int16_t, int8_t, + uint64_t, uint32_t, uint16_t, uint8_t, # Note: NPY_DATETIME, NPY_TIMEDELTA are only available # for cimport in cython>=0.27.3 NPY_DATETIME, NPY_TIMEDELTA) @@ -23,7 +25,7 @@ from pandas._libs import algos, hashtable as _hash from 
pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib from pandas._libs.missing import checknull -cdef int64_t iNaT = util.get_nat() +cdef int64_t NPY_NAT = util.get_nat() cdef inline bint is_definitely_invalid_key(object val): @@ -111,6 +113,8 @@ cdef class IndexEngine: if not self.is_unique: return self._get_loc_duplicates(val) values = self._get_index_values() + + self._check_type(val) loc = _bin_search(values, val) # .searchsorted(val, side='left') if loc >= len(values): raise KeyError(val) @@ -518,7 +522,7 @@ cpdef convert_scalar(ndarray arr, object value): elif isinstance(value, (datetime, np.datetime64, date)): return Timestamp(value).value elif value is None or value != value: - return iNaT + return NPY_NAT elif util.is_string_object(value): return Timestamp(value).value raise ValueError("cannot set a Timestamp with a non-timestamp") @@ -529,7 +533,7 @@ cpdef convert_scalar(ndarray arr, object value): elif isinstance(value, timedelta): return Timedelta(value).value elif value is None or value != value: - return iNaT + return NPY_NAT elif util.is_string_object(value): return Timedelta(value).value raise ValueError("cannot set a Timedelta with a non-timedelta") diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 4ea35da0626f3..b393283bfd4ca 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -4,20 +4,28 @@ Template for functions of IndexEngine subclasses. WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # IndexEngine Subclass Methods -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: -# name, dtype, ctype -dtypes = [('Float64', 'float64', 'float64_t'), - ('UInt64', 'uint64', 'uint64_t'), - ('Int64', 'int64', 'int64_t'), - ('Object', 'object', 'object')] +# name, dtype, ctype, hashtable_name, hashtable_dtype +dtypes = [('Float64', 'float64', 'float64_t', 'Float64', 'float64'), + ('Float32', 'float32', 'float32_t', 'Float64', 'float64'), + ('Int64', 'int64', 'int64_t', 'Int64', 'int64'), + ('Int32', 'int32', 'int32_t', 'Int64', 'int64'), + ('Int16', 'int16', 'int16_t', 'Int64', 'int64'), + ('Int8', 'int8', 'int8_t', 'Int64', 'int64'), + ('UInt64', 'uint64', 'uint64_t', 'UInt64', 'uint64'), + ('UInt32', 'uint32', 'uint32_t', 'UInt64', 'uint64'), + ('UInt16', 'uint16', 'uint16_t', 'UInt64', 'uint64'), + ('UInt8', 'uint8', 'uint8_t', 'UInt64', 'uint64'), + ('Object', 'object', 'object', 'PyObject', 'object'), + ] }} -{{for name, dtype, ctype in dtypes}} +{{for name, dtype, ctype, hashtable_name, hashtable_dtype in dtypes}} cdef class {{name}}Engine(IndexEngine): @@ -34,22 +42,25 @@ cdef class {{name}}Engine(IndexEngine): other, limit=limit) cdef _make_hash_table(self, n): - {{if name == 'Object'}} - return _hash.PyObjectHashTable(n) - {{else}} - return _hash.{{name}}HashTable(n) - {{endif}} + return _hash.{{hashtable_name}}HashTable(n) - {{if name != 'Float64' and name != 'Object'}} + {{if name not in {'Float64', 'Float32', 'Object'} }} cdef _check_type(self, object val): hash(val) if util.is_bool_object(val): raise KeyError(val) elif util.is_float_object(val): raise KeyError(val) + elif not util.is_integer_object(val): + raise KeyError(val) {{endif}} {{if name != 'Object'}} + cpdef _call_map_locations(self, 
values): + # self.mapping is of type {{hashtable_name}}HashTable, + # so convert dtype of values + self.mapping.map_locations(algos.ensure_{{hashtable_dtype}}(values)) + cdef _get_index_values(self): return algos.ensure_{{dtype}}(self.vgetter()) @@ -60,7 +71,7 @@ cdef class {{name}}Engine(IndexEngine): ndarray[{{ctype}}] values int count = 0 - {{if name != 'Float64'}} + {{if name not in {'Float64', 'Float32'} }} if not util.is_integer_object(val): raise KeyError(val) {{endif}} diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 82261094022fb..dae88d3b707bf 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,19 +1,27 @@ # -*- coding: utf-8 -*- import numbers +from operator import le, lt from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, PyObject_RichCompare) -cimport cython -from cython cimport Py_ssize_t +import cython +from cython import Py_ssize_t import numpy as np -from numpy cimport ndarray +cimport numpy as cnp +from numpy cimport ( + int64_t, int32_t, float64_t, float32_t, uint64_t, + ndarray, + PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take) +cnp.import_array() cimport util util.import_array() +from hashtable cimport Int64Vector, Int64VectorData + from tslibs import Timestamp from tslibs.timezones cimport tz_compare @@ -359,6 +367,67 @@ cdef class Interval(IntervalMixin): self.left // y, self.right // y, closed=self.closed) return NotImplemented + def overlaps(self, other): + """ + Check whether two Interval objects overlap. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + other : Interval + The interval to check against for an overlap. + + Returns + ------- + bool + ``True`` if the two intervals overlap, else ``False``. 
+ + Examples + -------- + >>> i1 = pd.Interval(0, 2) + >>> i2 = pd.Interval(1, 3) + >>> i1.overlaps(i2) + True + >>> i3 = pd.Interval(4, 5) + >>> i1.overlaps(i3) + False + + Intervals that share closed endpoints overlap: + + >>> i4 = pd.Interval(0, 1, closed='both') + >>> i5 = pd.Interval(1, 2, closed='both') + >>> i4.overlaps(i5) + True + + Intervals that only have an open endpoint in common do not overlap: + + >>> i6 = pd.Interval(1, 2, closed='neither') + >>> i4.overlaps(i6) + False + + See Also + -------- + IntervalArray.overlaps : The corresponding method for IntervalArray + IntervalIndex.overlaps : The corresponding method for IntervalIndex + """ + if not isinstance(other, Interval): + msg = '`other` must be an Interval, got {other}' + raise TypeError(msg.format(other=type(other).__name__)) + + # equality is okay if both endpoints are closed (overlap at a point) + op1 = le if (self.closed_left and other.closed_right) else lt + op2 = le if (other.closed_left and self.closed_right) else lt + + # overlaps is equivalent negation of two interval being disjoint: + # disjoint = (A.left > B.right) or (B.left > A.right) + # (simplifying the negation allows this to be done in less operations) + return op1(self.left, other.right) and op2(other.left, self.right) + @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 9ed76242a95c3..fb6f30c030f11 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -4,21 +4,6 @@ Template for intervaltree WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -from numpy cimport ( - int64_t, int32_t, float64_t, float32_t, uint64_t, - ndarray, - PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take) -import numpy as np - -cimport cython -from cython cimport Py_ssize_t - -cimport numpy as cnp -cnp.import_array() - -from hashtable cimport Int64Vector, Int64VectorData - - ctypedef fused scalar_t: float64_t float32_t @@ -26,10 +11,9 @@ ctypedef fused scalar_t: int32_t uint64_t - -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # IntervalTree -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- cdef class IntervalTree(IntervalMixin): """A centered interval tree @@ -42,7 +26,7 @@ cdef class IntervalTree(IntervalMixin): cdef: readonly object left, right, root, dtype readonly str closed - object _left_sorter, _right_sorter + object _is_overlapping, _left_sorter, _right_sorter def __init__(self, left, right, closed='right', leaf_size=100): """ @@ -72,6 +56,12 @@ cdef class IntervalTree(IntervalMixin): self.closed = closed + # GH 23352: ensure no nan in nodes + mask = ~np.isnan(self.left) + self.left = self.left[mask] + self.right = self.right[mask] + indices = indices[mask] + node_cls = NODE_CLASSES[str(self.dtype), closed] self.root = node_cls(self.left, self.right, indices, leaf_size) @@ -91,6 +81,26 @@ cdef class IntervalTree(IntervalMixin): self._right_sorter = np.argsort(self.right) return self._right_sorter + @property + def is_overlapping(self): + """ + Determine if the IntervalTree contains overlapping intervals. + Cached as self._is_overlapping. 
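The comment in the overlaps implementation above reduces the check to the negation of disjointness, choosing <= versus < according to which of the touching endpoints are closed. A plain-Python restatement of that predicate (intervals_overlap is a hypothetical helper mirroring the patch's logic, not part of pandas):

    from operator import le, lt

    import pandas as pd

    def intervals_overlap(a, b):
        # equality only counts when both touching endpoints are closed
        op1 = le if (a.closed_left and b.closed_right) else lt
        op2 = le if (b.closed_left and a.closed_right) else lt
        # "not disjoint":  a.left (<|<=) b.right  and  b.left (<|<=) a.right
        return op1(a.left, b.right) and op2(b.left, a.right)

    intervals_overlap(pd.Interval(0, 1, closed='both'),
                      pd.Interval(1, 2, closed='both'))    # True: shared closed endpoint
    intervals_overlap(pd.Interval(0, 1, closed='left'),
                      pd.Interval(1, 2, closed='left'))    # False: 1 is open in the first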
+ """ + if self._is_overlapping is not None: + return self._is_overlapping + + # <= when both sides closed since endpoints can overlap + op = le if self.closed == 'both' else lt + + # overlap if start of current interval < end of previous interval + # (current and previous in terms of sorted order by left/start side) + current = self.left[self.left_sorter[1:]] + previous = self.right[self.left_sorter[:-1]] + self._is_overlapping = bool(op(current, previous).any()) + + return self._is_overlapping + def get_loc(self, scalar_t key): """Return all positions corresponding to intervals that overlap with the given scalar key @@ -99,7 +109,7 @@ cdef class IntervalTree(IntervalMixin): self.root.query(result, key) if not result.data.n: raise KeyError(key) - return result.to_array() + return result.to_array().astype('intp') def _get_partial_overlap(self, key_left, key_right, side): """Return all positions corresponding to intervals with the given side @@ -126,7 +136,7 @@ cdef class IntervalTree(IntervalMixin): enclosing = self.get_loc(0.5 * (key_left + key_right)) combined = np.concatenate([left_overlap, right_overlap, enclosing]) uniques = pd.unique(combined) - return uniques + return uniques.astype('intp') def get_indexer(self, scalar_t[:] target): """Return the positions corresponding to unique intervals that overlap @@ -149,7 +159,7 @@ cdef class IntervalTree(IntervalMixin): raise KeyError( 'indexer does not intersect a unique set of intervals') old_len = result.data.n - return result.to_array() + return result.to_array().astype('intp') def get_indexer_non_unique(self, scalar_t[:] target): """Return the positions corresponding to intervals that overlap with @@ -169,7 +179,8 @@ cdef class IntervalTree(IntervalMixin): result.append(-1) missing.append(i) old_len = result.data.n - return result.to_array(), missing.to_array() + return (result.to_array().astype('intp'), + missing.to_array().astype('intp')) def __repr__(self): return (' np.NaN -cdef double nan = NaN - from pandas._libs.algos import groupsort_indexer, ensure_platform_int from pandas.core.algorithms import take_nd -include "join_func_helper.pxi" - def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): @@ -214,7 +208,7 @@ def _get_result_indexer(sorter, indexer): else: # length-0 case res = np.empty(len(indexer), dtype=np.int64) - res.fill(-1) + res[:] = -1 return res @@ -239,4 +233,774 @@ def ffill_indexer(ndarray[int64_t] indexer): return result -include "join_helper.pxi" +# ---------------------------------------------------------------------- +# left_join_indexer, inner_join_indexer, outer_join_indexer +# ---------------------------------------------------------------------- + +ctypedef fused join_t: + float64_t + float32_t + object + int32_t + int64_t + uint64_t + + +# Joins on ordered, unique indices + +# right might contain non-unique values + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + join_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + 
indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +left_join_indexer_unique_float64 = left_join_indexer_unique["float64_t"] +left_join_indexer_unique_float32 = left_join_indexer_unique["float32_t"] +left_join_indexer_unique_object = left_join_indexer_unique["object"] +left_join_indexer_unique_int32 = left_join_indexer_unique["int32_t"] +left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"] +left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + join_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[join_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=left.dtype) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +left_join_indexer_float64 = left_join_indexer["float64_t"] +left_join_indexer_float32 = left_join_indexer["float32_t"] +left_join_indexer_object = left_join_indexer["object"] +left_join_indexer_int32 = left_join_indexer["int32_t"] +left_join_indexer_int64 = left_join_indexer["int64_t"] +left_join_indexer_uint64 = left_join_indexer["uint64_t"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): + """ + Two-pass algorithm for monotonic indexes. 
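With the per-dtype templates folded into a single fused-type implementation, the dtype-specific names that other pandas code imports are kept alive through the explicit specializations above (for example left_join_indexer_int64 = left_join_indexer["int64_t"]). A usage sketch against the built pandas._libs.join module (both inputs must be monotonic, matching the assumption stated in the docstrings; the sample data and outputs are illustrative):

    import numpy as np
    from pandas._libs.join import left_join_indexer_int64

    left = np.array([0, 1, 2, 4], dtype=np.int64)    # monotonic
    right = np.array([1, 2, 3], dtype=np.int64)      # monotonic

    keys, lidx, ridx = left_join_indexer_int64(left, right)
    # keys keeps every key from `left`; ridx is -1 where `right` has no match
    # expected: keys == [0, 1, 2, 4], lidx == [0, 1, 2, 3], ridx == [-1, 0, 1, -1]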
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + join_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[join_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=left.dtype) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +inner_join_indexer_float64 = inner_join_indexer["float64_t"] +inner_join_indexer_float32 = inner_join_indexer["float32_t"] +inner_join_indexer_object = inner_join_indexer["object"] +inner_join_indexer_int32 = inner_join_indexer["int32_t"] +inner_join_indexer_int64 = inner_join_indexer["int64_t"] +inner_join_indexer_uint64 = inner_join_indexer["uint64_t"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + join_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[join_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=left.dtype) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft 
- 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + + +outer_join_indexer_float64 = outer_join_indexer["float64_t"] +outer_join_indexer_float32 = outer_join_indexer["float32_t"] +outer_join_indexer_object = outer_join_indexer["object"] +outer_join_indexer_int32 = outer_join_indexer["int32_t"] +outer_join_indexer_int64 = outer_join_indexer["int64_t"] +outer_join_indexer_uint64 = outer_join_indexer["uint64_t"] + + +# ---------------------------------------------------------------------- +# asof_join_by +# ---------------------------------------------------------------------- + +from hashtable cimport ( + HashTable, PyObjectHashTable, UInt64HashTable, Int64HashTable) + +ctypedef fused asof_t: + uint8_t + uint16_t + uint32_t + uint64_t + int8_t + int16_t + int32_t + int64_t + float + float64_t + +ctypedef fused by_t: + object + int64_t + uint64_t + + +def asof_join_backward_on_X_by_Y(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + ndarray[by_t] left_by_values, + ndarray[by_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + asof_t tolerance_ = 0 + asof_t diff = 0 + HashTable hash_table + by_t by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + if by_t is object: + hash_table = PyObjectHashTable(right_size) + elif by_t is int64_t: + hash_table = Int64HashTable(right_size) + elif by_t is uint64_t: + hash_table = UInt64HashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's + if allow_exact_matches: + while (right_pos < right_size and + right_values[right_pos] <= left_values[left_pos]): + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while (right_pos < right_size and + right_values[right_pos] < left_values[left_pos]): + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = (hash_table.get_item(by_value) + if by_value in hash_table else -1) + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_forward_on_X_by_Y(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + ndarray[by_t] left_by_values, + ndarray[by_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, 
right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + asof_t tolerance_ = 0 + asof_t diff = 0 + HashTable hash_table + by_t by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + if by_t is object: + hash_table = PyObjectHashTable(right_size) + elif by_t is int64_t: + hash_table = Int64HashTable(right_size) + elif by_t is uint64_t: + hash_table = UInt64HashTable(right_size) + + right_pos = right_size - 1 + for left_pos in range(left_size - 1, -1, -1): + # restart right_pos if it went over in a previous iteration + if right_pos == right_size: + right_pos = right_size - 1 + + # find first position in right whose value is greater than left's + if allow_exact_matches: + while (right_pos >= 0 and + right_values[right_pos] >= left_values[left_pos]): + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos -= 1 + else: + while (right_pos >= 0 and + right_values[right_pos] > left_values[left_pos]): + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos -= 1 + right_pos += 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = (hash_table.get_item(by_value) + if by_value in hash_table else -1) + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = right_values[found_right_pos] - left_values[left_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_nearest_on_X_by_Y(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + ndarray[by_t] left_by_values, + ndarray[by_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_size, right_size, i + ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + asof_t bdiff, fdiff + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + # search both forward and backward + bli, bri = asof_join_backward_on_X_by_Y(left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance) + fli, fri = asof_join_forward_on_X_by_Y(left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance) + + for i in range(len(bri)): + # choose timestamp from right with smaller difference + if bri[i] != -1 and fri[i] != -1: + bdiff = left_values[bli[i]] - right_values[bri[i]] + fdiff = right_values[fri[i]] - left_values[fli[i]] + right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] + else: + right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] + left_indexer[i] = bli[i] + + return left_indexer, right_indexer + + +# ---------------------------------------------------------------------- +# asof_join +# ---------------------------------------------------------------------- + +def asof_join_backward(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + asof_t 
tolerance_ = 0 + asof_t diff = 0 + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's + if allow_exact_matches: + while (right_pos < right_size and + right_values[right_pos] <= left_values[left_pos]): + right_pos += 1 + else: + while (right_pos < right_size and + right_values[right_pos] < left_values[left_pos]): + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_forward(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + asof_t tolerance_ = 0 + asof_t diff = 0 + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = right_size - 1 + for left_pos in range(left_size - 1, -1, -1): + # restart right_pos if it went over in a previous iteration + if right_pos == right_size: + right_pos = right_size - 1 + + # find first position in right whose value is greater than left's + if allow_exact_matches: + while (right_pos >= 0 and + right_values[right_pos] >= left_values[left_pos]): + right_pos -= 1 + else: + while (right_pos >= 0 and + right_values[right_pos] > left_values[left_pos]): + right_pos -= 1 + right_pos += 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = (right_pos + if right_pos != right_size else -1) + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != right_size: + diff = right_values[right_pos] - left_values[left_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_nearest(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_size, right_size, i + ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + asof_t bdiff, fdiff + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + # search both forward and backward + bli, bri = asof_join_backward(left_values, right_values, + allow_exact_matches, tolerance) + fli, fri = asof_join_forward(left_values, right_values, + allow_exact_matches, tolerance) + + for i in range(len(bri)): + # choose timestamp from right with smaller difference + if bri[i] != -1 and fri[i] != -1: + bdiff = left_values[bli[i]] - right_values[bri[i]] + fdiff = 
right_values[fri[i]] - left_values[fli[i]] + right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] + else: + right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] + left_indexer[i] = bli[i] + + return left_indexer, right_indexer diff --git a/pandas/_libs/join_func_helper.pxi.in b/pandas/_libs/join_func_helper.pxi.in deleted file mode 100644 index a72b113a6fdb6..0000000000000 --- a/pandas/_libs/join_func_helper.pxi.in +++ /dev/null @@ -1,374 +0,0 @@ -# cython: boundscheck=False, wraparound=False -""" -Template for each `dtype` helper function for hashtable - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# asof_join_by -#---------------------------------------------------------------------- - -from hashtable cimport PyObjectHashTable, UInt64HashTable, Int64HashTable - -{{py: - -# table_type, by_dtype -by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t'), - ('UInt64HashTable', 'uint64_t')] - -# on_dtype -on_dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', - 'int8_t', 'int16_t', 'int32_t', 'int64_t', - 'float', 'double'] - -}} - - - -{{for table_type, by_dtype in by_dtypes}} -{{for on_dtype in on_dtypes}} - - -def asof_join_backward_{{on_dtype}}_by_{{by_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - ndarray[{{by_dtype}}] left_by_values, - ndarray[{{by_dtype}}] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - {{on_dtype}} tolerance_ = 0 - {{on_dtype}} diff = 0 - {{table_type}} hash_table - {{by_dtype}} by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = {{table_type}}(right_size) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's - if allow_exact_matches: - while (right_pos < right_size and - right_values[right_pos] <= left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while (right_pos < right_size and - right_values[right_pos] < left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = (hash_table.get_item(by_value) - if by_value in hash_table else -1) - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_forward_{{on_dtype}}_by_{{by_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - ndarray[{{by_dtype}}] left_by_values, - ndarray[{{by_dtype}}] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, 
right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - {{on_dtype}} tolerance_ = 0 - {{on_dtype}} diff = 0 - {{table_type}} hash_table - {{by_dtype}} by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = {{table_type}}(right_size) - - right_pos = right_size - 1 - for left_pos in range(left_size - 1, -1, -1): - # restart right_pos if it went over in a previous iteration - if right_pos == right_size: - right_pos = right_size - 1 - - # find first position in right whose value is greater than left's - if allow_exact_matches: - while (right_pos >= 0 and - right_values[right_pos] >= left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos -= 1 - else: - while (right_pos >= 0 and - right_values[right_pos] > left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos -= 1 - right_pos += 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = (hash_table.get_item(by_value) - if by_value in hash_table else -1) - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = right_values[found_right_pos] - left_values[left_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_nearest_{{on_dtype}}_by_{{by_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - ndarray[{{by_dtype}}] left_by_values, - ndarray[{{by_dtype}}] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_size, right_size, i - ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri - {{on_dtype}} bdiff, fdiff - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - # search both forward and backward - bli, bri =\ - asof_join_backward_{{on_dtype}}_by_{{by_dtype}}(left_values, - right_values, - left_by_values, - right_by_values, - allow_exact_matches, - tolerance) - fli, fri =\ - asof_join_forward_{{on_dtype}}_by_{{by_dtype}}(left_values, - right_values, - left_by_values, - right_by_values, - allow_exact_matches, - tolerance) - - for i in range(len(bri)): - # choose timestamp from right with smaller difference - if bri[i] != -1 and fri[i] != -1: - bdiff = left_values[bli[i]] - right_values[bri[i]] - fdiff = right_values[fri[i]] - left_values[fli[i]] - right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] - else: - right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] - left_indexer[i] = bli[i] - - return left_indexer, right_indexer - -{{endfor}} -{{endfor}} - - -#---------------------------------------------------------------------- -# asof_join -#---------------------------------------------------------------------- - -{{py: - -# on_dtype -dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', - 'int8_t', 'int16_t', 'int32_t', 'int64_t', - 'float', 'double'] - -}} - -{{for on_dtype in dtypes}} - - -def asof_join_backward_{{on_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - bint 
allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - {{on_dtype}} tolerance_ = 0 - {{on_dtype}} diff = 0 - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's - if allow_exact_matches: - while (right_pos < right_size and - right_values[right_pos] <= left_values[left_pos]): - right_pos += 1 - else: - while (right_pos < right_size and - right_values[right_pos] < left_values[left_pos]): - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_forward_{{on_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - {{on_dtype}} tolerance_ = 0 - {{on_dtype}} diff = 0 - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = right_size - 1 - for left_pos in range(left_size - 1, -1, -1): - # restart right_pos if it went over in a previous iteration - if right_pos == right_size: - right_pos = right_size - 1 - - # find first position in right whose value is greater than left's - if allow_exact_matches: - while (right_pos >= 0 and - right_values[right_pos] >= left_values[left_pos]): - right_pos -= 1 - else: - while (right_pos >= 0 and - right_values[right_pos] > left_values[left_pos]): - right_pos -= 1 - right_pos += 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = (right_pos - if right_pos != right_size else -1) - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != right_size: - diff = right_values[right_pos] - left_values[left_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_nearest_{{on_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_size, right_size, i - ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri - {{on_dtype}} bdiff, fdiff - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - # search both forward and backward - bli, bri = asof_join_backward_{{on_dtype}}(left_values, right_values, - allow_exact_matches, 
tolerance) - fli, fri = asof_join_forward_{{on_dtype}}(left_values, right_values, - allow_exact_matches, tolerance) - - for i in range(len(bri)): - # choose timestamp from right with smaller difference - if bri[i] != -1 and fri[i] != -1: - bdiff = left_values[bli[i]] - right_values[bri[i]] - fdiff = right_values[fri[i]] - left_values[fli[i]] - right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] - else: - right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] - left_indexer[i] = bli[i] - - return left_indexer, right_indexer - -{{endfor}} diff --git a/pandas/_libs/join_helper.pxi.in b/pandas/_libs/join_helper.pxi.in deleted file mode 100644 index 6ba587a5b04ea..0000000000000 --- a/pandas/_libs/join_helper.pxi.in +++ /dev/null @@ -1,423 +0,0 @@ -""" -Template for each `dtype` helper function for join - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -# ---------------------------------------------------------------------- -# left_join_indexer, inner_join_indexer, outer_join_indexer -# ---------------------------------------------------------------------- - -ctypedef fused join_t: - float64_t - float32_t - object - int32_t - int64_t - uint64_t - - -# Joins on ordered, unique indices - -# right might contain non-unique values - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - join_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -left_join_indexer_unique_float64 = left_join_indexer_unique["float64_t"] -left_join_indexer_unique_float32 = left_join_indexer_unique["float32_t"] -left_join_indexer_unique_object = left_join_indexer_unique["object"] -left_join_indexer_unique_int32 = left_join_indexer_unique["int32_t"] -left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"] -left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"] - - -{{py: - -# name, c_type, dtype -dtypes = [('float64', 'float64_t', 'np.float64'), - ('float32', 'float32_t', 'np.float32'), - ('object', 'object', 'object'), - ('int32', 'int32_t', 'np.int32'), - ('int64', 'int64_t', 'np.int64'), - ('uint64', 'uint64_t', 'np.uint64')] - -def get_dispatch(dtypes): - - for name, c_type, dtype in dtypes: - yield name, c_type, dtype - -}} - -{{for name, c_type, dtype in get_dispatch(dtypes)}} - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_{{name}}(ndarray[{{c_type}}] left, - ndarray[{{c_type}}] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - {{c_type}} lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[{{c_type}}] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype={{dtype}}) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_{{name}}(ndarray[{{c_type}}] left, - ndarray[{{c_type}}] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - {{c_type}} lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[{{c_type}}] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype={{dtype}}) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_{{name}}(ndarray[{{c_type}}] left, - ndarray[{{c_type}}] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - {{c_type}} lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[{{c_type}}] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype={{dtype}}) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - 
lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -{{endfor}} diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0b9793a6ef97a..0c081986d83c5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- from decimal import Decimal +from fractions import Fraction +from numbers import Number + import sys import cython @@ -15,10 +18,9 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyDateTime_IMPORT) PyDateTime_IMPORT - import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, +from numpy cimport (ndarray, PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, flatiter, NPY_OBJECT, int64_t, @@ -45,19 +47,19 @@ cdef extern from "numpy/arrayobject.h": cdef extern from "src/parse_helper.h": - int floatify(object, double *result, int *maybe_int) except -1 + int floatify(object, float64_t *result, int *maybe_int) except -1 cimport util -from util cimport (is_nan, - UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN) +from util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN from tslib import array_to_datetime +from tslibs.nattype cimport NPY_NAT from tslibs.nattype import NaT from tslibs.conversion cimport convert_to_tsobject from tslibs.timedeltas cimport convert_to_timedelta64 from tslibs.timezones cimport get_timezone, tz_compare -from missing cimport (checknull, +from missing cimport (checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period) @@ -67,16 +69,13 @@ cdef object oINT64_MAX = INT64_MAX cdef object oINT64_MIN = INT64_MIN cdef object oUINT64_MAX = UINT64_MAX -cdef int64_t NPY_NAT = util.get_nat() -iNaT = util.get_nat() - cdef bint PY2 = sys.version_info[0] == 2 -cdef double nan = np.NaN +cdef float64_t NaN = np.NaN -def values_from_object(object obj): +def values_from_object(obj: object): """ return my values or the object if we are say an ndarray """ - cdef func # TODO: Does declaring this without a type accomplish anything? + func: object func = getattr(obj, 'get_values', None) if func is not None: @@ -104,27 +103,58 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t: # ---------------------------------------------------------------------- -def is_scalar(val: object) -> bint: +def is_scalar(val: object) -> bool: """ Return True if given value is scalar. - This includes: - - numpy array scalar (e.g. np.int64) - - Python builtin numerics - - Python builtin byte arrays and strings - - None - - instances of datetime.datetime - - instances of datetime.timedelta - - Period - - instances of decimal.Decimal - - Interval - - DateOffset + Parameters + ---------- + val : object + This includes: + + - numpy array scalar (e.g. 
np.int64) + - Python builtin numerics + - Python builtin byte arrays and strings + - None + - datetime.datetime + - datetime.timedelta + - Period + - decimal.Decimal + - Interval + - DateOffset + - Fraction + - Number + + Returns + ------- + bool + Return True if given object is scalar, False otherwise + + Examples + -------- + >>> dt = pd.datetime.datetime(2018, 10, 3) + >>> pd.is_scalar(dt) + True + + >>> pd.api.types.is_scalar([2, 3]) + False + + >>> pd.api.types.is_scalar({0: 1, 2: 3}) + False + >>> pd.api.types.is_scalar((0, 2)) + False + + pandas supports PEP 3141 numbers: + + >>> from fractions import Fraction + >>> pd.api.types.is_scalar(Fraction(3, 5)) + True """ return (cnp.PyArray_IsAnyScalar(val) # As of numpy-1.9, PyArray_IsAnyScalar misses bytearrays on Py3. - or isinstance(val, bytes) + or isinstance(val, (bytes, Fraction, Number)) # We differ from numpy (as of 1.10), which claims that None is # not scalar in np.isscalar(). or val is None @@ -195,7 +225,7 @@ def fast_unique_multiple(list arrays): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list(list lists, bint sort=True): +def fast_unique_multiple_list(lists: list, sort: bool=True) -> list: cdef: list buf Py_ssize_t k = len(lists) @@ -263,7 +293,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): @cython.wraparound(False) @cython.boundscheck(False) -def dicts_to_array(list dicts, list columns): +def dicts_to_array(dicts: list, columns: list): cdef: Py_ssize_t i, j, k, n ndarray[object, ndim=2] result @@ -304,7 +334,7 @@ def fast_zip(list ndarrays): # initialize tuples on first pass arr = ndarrays[0] - it = PyArray_IterNew(arr) + it = PyArray_IterNew(arr) for i in range(n): val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) tup = PyTuple_New(k) @@ -316,7 +346,7 @@ def fast_zip(list ndarrays): for j in range(1, k): arr = ndarrays[j] - it = PyArray_IterNew(arr) + it = PyArray_IterNew(arr) if len(arr) != n: raise ValueError('all arrays must be same length') @@ -347,7 +377,7 @@ def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): int64_t idx rev_indexer = np.empty(length, dtype=np.int64) - rev_indexer.fill(-1) + rev_indexer[:] = -1 for i in range(n): idx = indexer[i] if idx != -1: @@ -356,7 +386,9 @@ def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): return rev_indexer -def has_infs_f4(ndarray[float32_t] arr) -> bint: +@cython.wraparound(False) +@cython.boundscheck(False) +def has_infs_f4(ndarray[float32_t] arr) -> bool: cdef: Py_ssize_t i, n = len(arr) float32_t inf, neginf, val @@ -371,7 +403,9 @@ def has_infs_f4(ndarray[float32_t] arr) -> bint: return False -def has_infs_f8(ndarray[float64_t] arr) -> bint: +@cython.wraparound(False) +@cython.boundscheck(False) +def has_infs_f8(ndarray[float64_t] arr) -> bool: cdef: Py_ssize_t i, n = len(arr) float64_t inf, neginf, val @@ -423,6 +457,8 @@ def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len): return slice(vstart, vlast - 1, k) +@cython.wraparound(False) +@cython.boundscheck(False) def maybe_booleans_to_slice(ndarray[uint8_t] mask): cdef: Py_ssize_t i, n = len(mask) @@ -454,7 +490,7 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): @cython.wraparound(False) @cython.boundscheck(False) -def array_equivalent_object(left: object[:], right: object[:]) -> bint: +def array_equivalent_object(left: object[:], right: object[:]) -> bool: """ perform an element by element comparion on 1-d object arrays taking into account nan positions """ cdef: @@ -473,62 +509,112 @@ def 
array_equivalent_object(left: object[:], right: object[:]) -> bint: return True +@cython.wraparound(False) +@cython.boundscheck(False) def astype_intsafe(ndarray[object] arr, new_dtype): cdef: Py_ssize_t i, n = len(arr) - object v + object val bint is_datelike ndarray result - # on 32-bit, 1.6.2 numpy M8[ns] is a subdtype of integer, which is weird - is_datelike = new_dtype in ['M8[ns]', 'm8[ns]'] - + is_datelike = new_dtype == 'm8[ns]' result = np.empty(n, dtype=new_dtype) for i in range(n): - v = arr[i] - if is_datelike and checknull(v): + val = arr[i] + if is_datelike and checknull(val): result[i] = NPY_NAT else: - result[i] = v + result[i] = val return result -def astype_unicode(arr: ndarray) -> ndarray[object]: +@cython.wraparound(False) +@cython.boundscheck(False) +def astype_unicode(arr: ndarray, skipna: bool=False) -> ndarray[object]: + """ + Convert all elements in an array to unicode. + + Parameters + ---------- + arr : ndarray + The array whose elements we are casting. + skipna : bool, default False + Whether or not to coerce nulls to their stringified form + (e.g. NaN becomes 'nan'). + + Returns + ------- + casted_arr : ndarray + A new array with the input array's elements casted. + """ cdef: + object arr_i Py_ssize_t i, n = arr.size ndarray[object] result = np.empty(n, dtype=object) for i in range(n): - result[i] = unicode(arr[i]) + arr_i = arr[i] + + if not (skipna and checknull(arr_i)): + arr_i = unicode(arr_i) + + result[i] = arr_i return result -def astype_str(arr: ndarray) -> ndarray[object]: +@cython.wraparound(False) +@cython.boundscheck(False) +def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: + """ + Convert all elements in an array to string. + + Parameters + ---------- + arr : ndarray + The array whose elements we are casting. + skipna : bool, default False + Whether or not to coerce nulls to their stringified form + (e.g. NaN becomes 'nan'). + + Returns + ------- + casted_arr : ndarray + A new array with the input array's elements casted. 
+ """ cdef: + object arr_i Py_ssize_t i, n = arr.size ndarray[object] result = np.empty(n, dtype=object) for i in range(n): - result[i] = str(arr[i]) + arr_i = arr[i] + + if not (skipna and checknull(arr_i)): + arr_i = str(arr_i) + + result[i] = arr_i return result -def clean_index_list(list obj): +@cython.wraparound(False) +@cython.boundscheck(False) +def clean_index_list(obj: list): """ Utility used in pandas.core.index.ensure_index """ cdef: Py_ssize_t i, n = len(obj) - object v + object val bint all_arrays = 1 for i in range(n): - v = obj[i] - if not (isinstance(v, list) or - util.is_array(v) or hasattr(v, '_data')): + val = obj[i] + if not (isinstance(val, list) or + util.is_array(val) or hasattr(val, '_data')): all_arrays = 0 break @@ -537,12 +623,10 @@ def clean_index_list(list obj): # don't force numpy coerce with nan's inferred = infer_dtype(obj) - if inferred in ['string', 'bytes', 'unicode', - 'mixed', 'mixed-integer']: + if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']: return np.asarray(obj, dtype=object), 0 elif inferred in ['integer']: - - # TODO: we infer an integer but it *could* be a unint64 + # TODO: we infer an integer but it *could* be a uint64 try: return np.asarray(obj, dtype='int64'), 0 except OverflowError: @@ -572,7 +656,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, nat_count = 0 if hasnans: - mask = values == iNaT + mask = values == NPY_NAT nat_count = np.sum(mask) values = values[~mask] @@ -628,7 +712,7 @@ def row_bool_subset(ndarray[float64_t, ndim=2] values, Py_ssize_t i, j, n, k, pos = 0 ndarray[float64_t, ndim=2] out - n, k = ( values).shape + n, k = (values).shape assert (n == len(mask)) out = np.empty((mask.sum(), k), dtype=np.float64) @@ -650,7 +734,7 @@ def row_bool_subset_object(ndarray[object, ndim=2] values, Py_ssize_t i, j, n, k, pos = 0 ndarray[object, ndim=2] out - n, k = ( values).shape + n, k = (values).shape assert (n == len(mask)) out = np.empty((mask.sum(), k), dtype=object) @@ -698,7 +782,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, ndarray[int64_t, ndim=2] counts assert (axis == 0 or axis == 1) - n, k = ( mask).shape + n, k = (mask).shape if axis == 0: counts = np.zeros((max_bin, k), dtype='i8') @@ -789,19 +873,19 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys, # core.common import for fast inference checks -def is_float(obj: object) -> bint: +def is_float(obj: object) -> bool: return util.is_float_object(obj) -def is_integer(obj: object) -> bint: +def is_integer(obj: object) -> bool: return util.is_integer_object(obj) -def is_bool(obj: object) -> bint: +def is_bool(obj: object) -> bool: return util.is_bool_object(obj) -def is_complex(obj: object) -> bint: +def is_complex(obj: object) -> bool: return util.is_complex_object(obj) @@ -813,7 +897,7 @@ cpdef bint is_interval(object obj): return getattr(obj, '_typ', '_typ') == 'interval' -def is_period(val: object) -> bint: +def is_period(val: object) -> bool: """ Return a boolean if this is a Period object """ return util.is_period_object(val) @@ -994,7 +1078,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(object value, bint skipna=False): +def infer_dtype(value: object, skipna: bool=False) -> str: """ Efficiently infer the type of a passed val, or list-like array of values. Return a string describing the type. 
@@ -1125,6 +1209,9 @@ def infer_dtype(object value, bint skipna=False): values = construct_1d_object_array_from_listlike(value) values = getattr(values, 'values', values) + if skipna: + values = values[~isnaobj(values)] + val = _try_infer_map(values) if val is not None: return val @@ -1161,25 +1248,19 @@ def infer_dtype(object value, bint skipna=False): if util.is_datetime64_object(val): if is_datetime64_array(values): return 'datetime64' - elif is_timedelta_or_timedelta64_array(values): - return 'timedelta' elif is_timedelta(val): if is_timedelta_or_timedelta64_array(values): return 'timedelta' elif util.is_integer_object(val): - # a timedelta will show true here as well - if is_timedelta(val): - if is_timedelta_or_timedelta64_array(values): - return 'timedelta' + # ordering matters here; this check must come after the is_timedelta + # check otherwise numpy timedelta64 objects would come through here if is_integer_array(values): return 'integer' elif is_integer_float_array(values): return 'mixed-integer-float' - elif is_timedelta_or_timedelta64_array(values): - return 'timedelta' return 'mixed-integer' elif PyDateTime_Check(val): @@ -1295,7 +1376,7 @@ def infer_datetimelike_array(arr: object) -> object: seen_datetime = 1 elif PyDate_Check(v): seen_date = 1 - elif is_timedelta(v) or util.is_timedelta64_object(v): + elif is_timedelta(v): # timedelta, or timedelta64 seen_timedelta = 1 else: @@ -1574,7 +1655,7 @@ cpdef bint is_datetime64_array(ndarray values): return validator.validate(values) -def is_datetime_with_singletz_array(values: ndarray) -> bint: +def is_datetime_with_singletz_array(values: ndarray) -> bool: """ Check values have the same tzinfo attribute. Doesn't check values are datetime-like types. @@ -1585,20 +1666,22 @@ def is_datetime_with_singletz_array(values: ndarray) -> bint: if n == 0: return False - + # Get a reference timezone to compare with the rest of the tzs in the array for i in range(n): base_val = values[i] if base_val is not NaT: base_tz = get_timezone(getattr(base_val, 'tzinfo', None)) - - for j in range(i, n): - val = values[j] - if val is not NaT: - tz = getattr(val, 'tzinfo', None) - if not tz_compare(base_tz, tz): - return False break + for j in range(i, n): + # Compare val's timezone with the reference timezone + # NaT can coexist with tz-aware datetimes, so skip if encountered + val = values[j] + if val is not NaT: + tz = getattr(val, 'tzinfo', None) + if not tz_compare(base_tz, tz): + return False + return True @@ -1610,27 +1693,6 @@ cdef class TimedeltaValidator(TemporalValidator): return is_null_timedelta64(value) -# TODO: Not used outside of tests; remove? -def is_timedelta_array(values: ndarray) -> bint: - cdef: - TimedeltaValidator validator = TimedeltaValidator(len(values), - skipna=True) - return validator.validate(values) - - -cdef class Timedelta64Validator(TimedeltaValidator): - cdef inline bint is_value_typed(self, object value) except -1: - return util.is_timedelta64_object(value) - - -# TODO: Not used outside of tests; remove? 
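The skipna handling added to infer_dtype above filters NA values (via isnaobj) before type inference runs. A minimal usage sketch of the user-facing effect, assuming the public pandas.api.types.infer_dtype wrapper exposes the same keyword; the exact return strings are typical, not guaranteed here:

    import numpy as np
    from pandas.api.types import infer_dtype

    vals = ['apple', 'banana', np.nan]
    print(infer_dtype(vals))               # typically 'mixed': NaN blocks string inference
    print(infer_dtype(vals, skipna=True))  # 'string': NaN values are dropped first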
-def is_timedelta64_array(values: ndarray) -> bint: - cdef: - Timedelta64Validator validator = Timedelta64Validator(len(values), - skipna=True) - return validator.validate(values) - - cdef class AnyTimedeltaValidator(TimedeltaValidator): cdef inline bint is_value_typed(self, object value) except -1: return is_timedelta(value) @@ -1757,7 +1819,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, if val.__hash__ is not None and val in na_values: seen.saw_null() - floats[i] = complexes[i] = nan + floats[i] = complexes[i] = NaN elif util.is_float_object(val): fval = val if fval != fval: @@ -1788,11 +1850,11 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, seen.bool_ = True elif val is None: seen.saw_null() - floats[i] = complexes[i] = nan + floats[i] = complexes[i] = NaN elif hasattr(val, '__len__') and len(val) == 0: if convert_empty or seen.coerce_numeric: seen.saw_null() - floats[i] = complexes[i] = nan + floats[i] = complexes[i] = NaN else: raise ValueError('Empty string encountered') elif util.is_complex_object(val): @@ -1807,7 +1869,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, if fval in na_values: seen.saw_null() - floats[i] = complexes[i] = nan + floats[i] = complexes[i] = NaN else: if fval != fval: seen.null_ = True @@ -1840,7 +1902,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, elif "uint64" in str(e): # Exception from check functions. raise seen.saw_null() - floats[i] = nan + floats[i] = NaN if seen.check_uint64_conflict(): return values @@ -1908,10 +1970,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, floats[i] = complexes[i] = fnan elif val is NaT: if convert_datetime: - idatetimes[i] = iNaT + idatetimes[i] = NPY_NAT seen.datetime_ = 1 if convert_timedelta: - itimedeltas[i] = iNaT + itimedeltas[i] = NPY_NAT seen.timedelta_ = 1 if not (convert_datetime or convert_timedelta): seen.object_ = 1 @@ -1938,8 +2000,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, break elif util.is_integer_object(val): seen.int_ = 1 - floats[i] = val - complexes[i] = val + floats[i] = val + complexes[i] = val if not seen.null_: seen.saw_int(int(val)) @@ -1988,7 +2050,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: - if len({getattr(val, 'tzinfo', None) for val in objects}) == 1: + if is_datetime_with_singletz_array(objects): from pandas import DatetimeIndex return DatetimeIndex(objects) seen.object_ = 1 @@ -2058,6 +2120,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects +@cython.boundscheck(False) +@cython.wraparound(False) def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, bint convert=1): """ @@ -2081,11 +2145,11 @@ def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, result = np.empty(n, dtype=object) for i in range(n): if mask[i]: - val = util.get_value_at(arr, i) + val = arr[i] else: - val = f(util.get_value_at(arr, i)) + val = f(arr[i]) - if util.is_array(val) and PyArray_NDIM(val) == 0: + if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 # TODO: is there a faster way to unbox? # item_from_zerodim? 
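The map_infer_mask hunk above (and map_infer just below) switch to cnp.PyArray_IsZeroDim when unboxing zero-dimensional ndarray results. A rough pure-Python sketch of what these helpers do conceptually, not the Cython implementation; the real functions also post-process with the internal maybe_convert_objects when convert=1, which is omitted here:

    import numpy as np

    def map_infer_sketch(arr, f):
        # Apply f elementwise into an object array, unboxing 0-dim results.
        out = np.empty(len(arr), dtype=object)
        for i, x in enumerate(arr):
            val = f(x)
            if isinstance(val, np.ndarray) and val.ndim == 0:
                val = val.item()  # unbox 0-dim arrays (the GH#690 case noted above)
            out[i] = val
        return out

    print(map_infer_sketch(np.array(["a", "bb"], dtype=object),
                           lambda s: np.asarray(len(s))))
    # object array of plain ints: [1 2]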
@@ -2102,6 +2166,8 @@ def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, return result +@cython.boundscheck(False) +@cython.wraparound(False) def map_infer(ndarray arr, object f, bint convert=1): """ Substitute for np.vectorize with pandas-friendly dtype inference @@ -2123,9 +2189,9 @@ def map_infer(ndarray arr, object f, bint convert=1): n = len(arr) result = np.empty(n, dtype=object) for i in range(n): - val = f(util.get_value_at(arr, i)) + val = f(arr[i]) - if util.is_array(val) and PyArray_NDIM(val) == 0: + if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 # TODO: is there a faster way to unbox? # item_from_zerodim? @@ -2142,7 +2208,7 @@ def map_infer(ndarray arr, object f, bint convert=1): return result -def to_object_array(list rows, int min_width=0): +def to_object_array(rows: list, min_width: int=0): """ Convert a list of lists into an object array. @@ -2201,7 +2267,7 @@ def tuples_to_object_array(ndarray[object] tuples): return result -def to_object_array_tuples(list rows): +def to_object_array_tuples(rows: list): cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result @@ -2211,7 +2277,7 @@ def to_object_array_tuples(list rows): k = 0 for i in range(n): - tmp = len(rows[i]) + tmp = 1 if checknull(rows[i]) else len(rows[i]) if tmp > k: k = tmp @@ -2225,13 +2291,15 @@ def to_object_array_tuples(list rows): except Exception: # upcast any subclasses to tuple for i in range(n): - row = tuple(rows[i]) + row = (rows[i],) if checknull(rows[i]) else tuple(rows[i]) for j in range(len(row)): result[i, j] = row[j] return result +@cython.wraparound(False) +@cython.boundscheck(False) def fast_multiget(dict mapping, ndarray keys, default=np.nan): cdef: Py_ssize_t i, n = len(keys) diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index 2c1f13eeb5dff..d0dd306680ae8 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- -from tslibs.nattype cimport is_null_datetimelike +from numpy cimport ndarray, uint8_t cpdef bint checknull(object val) cpdef bint checknull_old(object val) +cpdef ndarray[uint8_t] isnaobj(ndarray arr) cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 2590a30c57f33..1fdb04dd10d8e 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -5,16 +5,17 @@ from cython import Py_ssize_t import numpy as np cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t +from numpy cimport ndarray, int64_t, uint8_t, float64_t cnp.import_array() cimport util from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value +from tslibs.nattype cimport checknull_with_nat from tslibs.nattype import NaT -cdef double INF = np.inf -cdef double NEGINF = -INF +cdef float64_t INF = np.inf +cdef float64_t NEGINF = -INF cdef int64_t NPY_NAT = util.get_nat() @@ -124,7 +125,7 @@ cdef inline bint _check_none_nan_inf_neginf(object val): @cython.wraparound(False) @cython.boundscheck(False) -def isnaobj(ndarray arr): +cpdef ndarray[uint8_t] isnaobj(ndarray arr): """ Return boolean mask denoting which elements of a 1-D array are na-like, according to the criteria defined in `_check_all_nulls`: @@ -224,7 +225,7 @@ def isnaobj2d(ndarray arr): assert arr.ndim == 2, "'arr' must be 2-D." - n, m = ( arr).shape + n, m = (arr).shape result = np.zeros((n, m), dtype=np.uint8) for i in range(n): for j in range(m): @@ -268,7 +269,7 @@ def isnaobj2d_old(ndarray arr): assert arr.ndim == 2, "'arr' must be 2-D." 
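isnaobj and its 2-D variants being touched here are the kernels pandas relies on for elementwise NA detection on object arrays. A small user-level illustration of the semantics they implement, via the standard pandas API and independent of the cpdef change itself:

    import numpy as np
    import pandas as pd

    arr = np.array([["x", None], [np.nan, pd.NaT]], dtype=object)
    print(pd.isna(arr))
    # [[False  True]
    #  [ True  True]]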
- n, m = ( arr).shape + n, m = (arr).shape result = np.zeros((n, m), dtype=np.uint8) for i in range(n): for j in range(m): @@ -278,14 +279,14 @@ def isnaobj2d_old(ndarray arr): return result.view(np.bool_) -cpdef bint isposinf_scalar(object val): +def isposinf_scalar(val: object) -> bool: if util.is_float_object(val) and val == INF: return True else: return False -cpdef bint isneginf_scalar(object val): +def isneginf_scalar(val: object) -> bool: if util.is_float_object(val) and val == NEGINF: return True else: @@ -295,9 +296,7 @@ cpdef bint isneginf_scalar(object val): cdef inline bint is_null_datetime64(v): # determine if we have a null for a datetime (or integer versions), # excluding np.timedelta64('nat') - if v is None or util.is_nan(v): - return True - elif v is NaT: + if checknull_with_nat(v): return True elif util.is_datetime64_object(v): return v.view('int64') == NPY_NAT @@ -307,9 +306,7 @@ cdef inline bint is_null_datetime64(v): cdef inline bint is_null_timedelta64(v): # determine if we have a null for a timedelta (or integer versions), # excluding np.datetime64('nat') - if v is None or util.is_nan(v): - return True - elif v is NaT: + if checknull_with_nat(v): return True elif util.is_timedelta64_object(v): return v.view('int64') == NPY_NAT @@ -319,8 +316,4 @@ cdef inline bint is_null_timedelta64(v): cdef inline bint is_null_period(v): # determine if we have a null for a Period (or integer versions), # excluding np.datetime64('nat') and np.timedelta64('nat') - if v is None or util.is_nan(v): - return True - elif v is NaT: - return True - return False + return checknull_with_nat(v) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e3df391c5c45d..a459057555cf3 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -65,8 +65,8 @@ CParserError = ParserError cdef bint PY3 = (sys.version_info[0] >= 3) -cdef double INF = np.inf -cdef double NEGINF = -INF +cdef float64_t INF = np.inf +cdef float64_t NEGINF = -INF cdef extern from "errno.h": @@ -132,6 +132,7 @@ cdef extern from "parser/tokenizer.h": int64_t *word_starts # where we are in the stream int64_t words_len int64_t words_cap + int64_t max_words_cap # maximum word cap encountered char *pword_start # pointer to stream start of current field int64_t word_start # position start of current field @@ -182,10 +183,10 @@ cdef extern from "parser/tokenizer.h": int64_t skip_first_N_rows int64_t skipfooter # pick one, depending on whether the converter requires GIL - double (*double_converter_nogil)(const char *, char **, - char, char, char, int) nogil - double (*double_converter_withgil)(const char *, char **, - char, char, char, int) + float64_t (*double_converter_nogil)(const char *, char **, + char, char, char, int) nogil + float64_t (*double_converter_withgil)(const char *, char **, + char, char, char, int) # error handling char *warn_msg @@ -233,12 +234,12 @@ cdef extern from "parser/tokenizer.h": uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) nogil - double xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing) nogil - double precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing) nogil - double round_trip(const char *p, char **q, char decimal, char sci, + float64_t xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing) nogil + float64_t precise_xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int 
skip_trailing) nogil + float64_t round_trip(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing) nogil int to_boolean(const char *item, uint8_t *val) nogil @@ -302,6 +303,7 @@ cdef class TextReader: object tupleize_cols object usecols list dtype_cast_order + set unnamed_cols set noconvert def __cinit__(self, source, @@ -361,7 +363,7 @@ cdef class TextReader: if not isinstance(encoding, bytes): encoding = encoding.encode('utf-8') encoding = encoding.lower() - self.c_encoding = encoding + self.c_encoding = encoding else: self.c_encoding = NULL @@ -536,7 +538,7 @@ cdef class TextReader: self.header = [ header ] self.names = names - self.header, self.table_width = self._get_header() + self.header, self.table_width, self.unnamed_cols = self._get_header() if not self.table_width: raise EmptyDataError("No columns to parse from file") @@ -611,7 +613,7 @@ cdef class TextReader: for i in self.skiprows: parser_add_skiprow(self.parser, i) else: - self.parser.skipfunc = self.skiprows + self.parser.skipfunc = self.skiprows cdef _setup_parser_source(self, source): cdef: @@ -668,7 +670,7 @@ cdef class TextReader: source = icom.UTF8Recoder(source, self.encoding.decode('utf-8')) self.encoding = b'utf-8' - self.c_encoding = self.encoding + self.c_encoding = self.encoding self.handle = source @@ -720,13 +722,15 @@ cdef class TextReader: cdef: Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa char *word - object name + object name, old_name int status int64_t hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) header = [] + unnamed_cols = set() + if self.parser.header_start >= 0: # Header is in the file @@ -759,6 +763,7 @@ cdef class TextReader: counts = {} unnamed_count = 0 + for i in range(field_count): word = self.parser.words[start + i] @@ -770,6 +775,9 @@ cdef class TextReader: name = PyUnicode_Decode(word, strlen(word), self.c_encoding, errors) + # We use this later when collecting placeholder names. 
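The placeholder-name tracking being introduced in _get_header (the unnamed_cols set) corresponds at the user level to the familiar 'Unnamed: N' column labels. A minimal illustration using ordinary read_csv, shown only for orientation rather than as part of these internals:

    import io
    import pandas as pd

    df = pd.read_csv(io.StringIO("a,,c\n1,2,3"))
    print(df.columns.tolist())  # ['a', 'Unnamed: 1', 'c']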
+ old_name = name + if name == '': if self.has_mi_columns: name = ('Unnamed: {i}_level_{lvl}' @@ -786,6 +794,9 @@ cdef class TextReader: name = '%s.%d' % (name, count) count = counts.get(name, 0) + if old_name == '': + unnamed_cols.add(name) + this_header.append(name) counts[name] = count + 1 @@ -798,6 +809,7 @@ cdef class TextReader: lc = len(this_header) ic = (len(self.index_col) if self.index_col is not None else 0) + if lc != unnamed_count and lc - ic > unnamed_count: hr -= 1 self.parser_start -= 1 @@ -830,7 +842,7 @@ cdef class TextReader: if self.parser.lines < 1: self._tokenize_rows(1) - return None, self.parser.line_fields[0] + return None, self.parser.line_fields[0], unnamed_cols # Corner case, not enough lines in the file if self.parser.lines < data_line + 1: @@ -864,7 +876,7 @@ cdef class TextReader: elif self.allow_leading_cols and passed_count < field_count: self.leading_cols = field_count - passed_count - return header, field_count + return header, field_count, unnamed_cols def read(self, rows=None): """ @@ -1058,18 +1070,6 @@ cdef class TextReader: conv = self._get_converter(i, name) - # XXX - na_flist = set() - if self.na_filter: - na_list, na_flist = self._get_na_list(i, name) - if na_list is None: - na_filter = 0 - else: - na_filter = 1 - na_hashset = kset_from_list(na_list) - else: - na_filter = 0 - col_dtype = None if self.dtype is not None: if isinstance(self.dtype, dict): @@ -1094,13 +1094,34 @@ cdef class TextReader: self.c_encoding) continue - # Should return as the desired dtype (inferred or specified) - col_res, na_count = self._convert_tokens( - i, start, end, name, na_filter, na_hashset, - na_flist, col_dtype) + # Collect the list of NaN values associated with the column. + # If we aren't supposed to do that, or none are collected, + # we set `na_filter` to `0` (`1` otherwise). + na_flist = set() - if na_filter: - self._free_na_set(na_hashset) + if self.na_filter: + na_list, na_flist = self._get_na_list(i, name) + if na_list is None: + na_filter = 0 + else: + na_filter = 1 + na_hashset = kset_from_list(na_list) + else: + na_filter = 0 + + # Attempt to parse tokens and infer dtype of the column. + # Should return as the desired dtype (inferred or specified). + try: + col_res, na_count = self._convert_tokens( + i, start, end, name, na_filter, na_hashset, + na_flist, col_dtype) + finally: + # gh-21353 + # + # Cleanup the NaN hash that we generated + # to avoid memory leaks. + if na_filter: + self._free_na_set(na_hashset) if upcast_na and na_count > 0: col_res = _maybe_upcast(col_res) @@ -1181,7 +1202,20 @@ cdef class TextReader: bint user_dtype, kh_str_t *na_hashset, object na_flist): - if is_integer_dtype(dtype): + if is_categorical_dtype(dtype): + # TODO: I suspect that _categorical_convert could be + # optimized when dtype is an instance of CategoricalDtype + codes, cats, na_count = _categorical_convert( + self.parser, i, start, end, na_filter, + na_hashset, self.c_encoding) + + # Method accepts list of strings, not encoded ones. 
+ true_values = [x.decode() for x in self.true_values] + cat = Categorical._from_inferred_categories( + cats, codes, dtype, true_values=true_values) + return cat, na_count + + elif is_integer_dtype(dtype): try: result, na_count = _try_int64(self.parser, i, start, end, na_filter, na_hashset) @@ -1211,7 +1245,12 @@ cdef class TextReader: result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, self.true_set, self.false_set) + if user_dtype and na_count is not None: + if na_count > 0: + raise ValueError("Bool column has NA values in " + "column {column}".format(column=i)) return result, na_count + elif dtype.kind == 'S': # TODO: na handling width = dtype.itemsize @@ -1231,15 +1270,6 @@ cdef class TextReader: # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - elif is_categorical_dtype(dtype): - # TODO: I suspect that _categorical_convert could be - # optimized when dtype is an instance of CategoricalDtype - codes, cats, na_count = _categorical_convert( - self.parser, i, start, end, na_filter, - na_hashset, self.c_encoding) - cat = Categorical._from_inferred_categories(cats, codes, dtype) - return cat, na_count - elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) @@ -1438,13 +1468,13 @@ cdef _string_box_factorize(parser_t *parser, int64_t col, # in the hash table if k != table.n_buckets: # this increments the refcount, but need to test - pyval = table.vals[k] + pyval = table.vals[k] else: # box it. new ref? pyval = PyBytes_FromString(word) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1492,13 +1522,13 @@ cdef _string_box_utf8(parser_t *parser, int64_t col, # in the hash table if k != table.n_buckets: # this increments the refcount, but need to test - pyval = table.vals[k] + pyval = table.vals[k] else: # box it. new ref? pyval = PyUnicode_FromString(word) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1549,14 +1579,14 @@ cdef _string_box_decode(parser_t *parser, int64_t col, # in the hash table if k != table.n_buckets: # this increments the refcount, but need to test - pyval = table.vals[k] + pyval = table.vals[k] else: # box it. new ref? 
size = strlen(word) pyval = PyUnicode_Decode(word, size, encoding, errors) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1648,7 +1678,7 @@ cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, ndarray result result = np.empty(line_end - line_start, dtype='|S%d' % width) - data = result.data + data = result.data with nogil: _to_fw_string_nogil(parser, col, line_start, line_end, width, data) @@ -1686,8 +1716,8 @@ cdef _try_double(parser_t *parser, int64_t col, coliter_t it const char *word = NULL char *p_end - double *data - double NA = na_values[np.float64] + float64_t *data + float64_t NA = na_values[np.float64] kh_float64_t *na_fset ndarray result khiter_t k @@ -1695,7 +1725,7 @@ cdef _try_double(parser_t *parser, int64_t col, lines = line_end - line_start result = np.empty(lines, dtype=np.float64) - data = result.data + data = result.data na_fset = kset_float64_from_list(na_flist) if parser.double_converter_nogil != NULL: # if it can run without the GIL with nogil: @@ -1706,8 +1736,8 @@ cdef _try_double(parser_t *parser, int64_t col, else: assert parser.double_converter_withgil != NULL error = _try_double_nogil(parser, - parser.double_converter_withgil, col, line_start, line_end, na_filter, na_hashset, use_na_flist, @@ -1719,14 +1749,14 @@ cdef _try_double(parser_t *parser, int64_t col, cdef inline int _try_double_nogil(parser_t *parser, - double (*double_converter)( + float64_t (*double_converter)( const char *, char **, char, char, char, int) nogil, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, bint use_na_flist, const kh_float64_t *na_flist, - double NA, double *data, + float64_t NA, float64_t *data, int *na_count) nogil: cdef: int error, @@ -1803,7 +1833,7 @@ cdef _try_uint64(parser_t *parser, int64_t col, lines = line_end - line_start result = np.empty(lines, dtype=np.uint64) - data = result.data + data = result.data uint_state_init(&state) coliter_setup(&it, parser, col, line_start) @@ -1879,7 +1909,7 @@ cdef _try_int64(parser_t *parser, int64_t col, lines = line_end - line_start result = np.empty(lines, dtype=np.int64) - data = result.data + data = result.data coliter_setup(&it, parser, col, line_start) with nogil: error = _try_int64_nogil(parser, col, line_start, line_end, @@ -1951,7 +1981,7 @@ cdef _try_bool_flex(parser_t *parser, int64_t col, lines = line_end - line_start result = np.empty(lines, dtype=np.uint8) - data = result.data + data = result.data with nogil: error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, true_hashset, @@ -2047,6 +2077,7 @@ cdef kh_str_t* kset_from_list(list values) except NULL: # None creeps in sometimes, which isn't possible here if not isinstance(val, bytes): + kh_destroy_str(table) raise ValueError('Must be all encoded bytes') k = kh_put_str(table, PyBytes_AsString(val), &ret) @@ -2087,14 +2118,14 @@ cdef raise_parser_error(object base, parser_t *parser): Py_XDECREF(traceback) if value != NULL: - old_exc = value + old_exc = value Py_XDECREF(value) # PyErr_Fetch only returned the error message in *value, # so the Exception class must be extracted from *type. 
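The _string_box_* converters above cache each distinct parsed word in a khash table so a repeated string is boxed into a Python object only once and then reused. A rough Python analogue of that interning pattern, purely illustrative, with a dict standing in for the kh_strbox table:

    def box_words(words):
        cache = {}                        # plays the role of the khash table
        out = []
        for w in words:
            obj = cache.get(w)
            if obj is None:
                obj = w.decode("utf-8")   # "box" the raw word exactly once
                cache[w] = obj
            out.append(obj)               # repeated words share one object
        return out

    print(box_words([b"a", b"b", b"a"]))  # ['a', 'b', 'a'], both 'a's the same object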
if isinstance(old_exc, compat.string_types): if type != NULL: - exc_type = type + exc_type = type else: exc_type = ParserError diff --git a/pandas/_libs/properties.pyx b/pandas/_libs/properties.pyx index 6e4c0c62b0dd8..d2fbf5aa66fbf 100644 --- a/pandas/_libs/properties.pyx +++ b/pandas/_libs/properties.pyx @@ -31,7 +31,7 @@ cdef class CachedProperty(object): if PyDict_Contains(cache, self.name): # not necessary to Py_INCREF - val = PyDict_GetItem(cache, self.name) + val = PyDict_GetItem(cache, self.name) else: val = self.func(obj) PyDict_SetItem(cache, self.name, val) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 681ea2c6295f2..6f892c928805e 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -153,7 +153,7 @@ cdef class Reducer: result = _get_result_array(res, self.nresults, len(self.dummy)) - it = PyArray_IterNew(result) + it = PyArray_IterNew(result) PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) chunk.data = chunk.data + self.increment @@ -438,6 +438,7 @@ cdef inline _extract_result(object res): res = res[0] return res + cdef class Slider: """ Only handles contiguous data for now @@ -466,7 +467,7 @@ cdef class Slider: self.buf.strides[0] = self.stride cpdef advance(self, Py_ssize_t k): - self.buf.data = self.buf.data + self.stride * k + self.buf.data = self.buf.data + self.stride * k cdef move(self, int start, int end): """ @@ -571,9 +572,9 @@ cdef class BlockSlider: self.idx_slider = Slider( self.frame.index.values, self.dummy.index.values) - self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) + self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) for i, block in enumerate(self.blocks): - self.base_ptrs[i] = ( block).data + self.base_ptrs[i] = (block).data def __dealloc__(self): free(self.base_ptrs) diff --git a/pandas/_libs/skiplist.pyx b/pandas/_libs/skiplist.pyx index eec0457fc4caf..6698fcb767d7c 100644 --- a/pandas/_libs/skiplist.pyx +++ b/pandas/_libs/skiplist.pyx @@ -105,7 +105,7 @@ cdef class IndexableSkiplist: steps += steps_at_level[level] for level in range(d, self.maxlevels): - ( chain[level]).width[level] += 1 + (chain[level]).width[level] += 1 self.size += 1 @@ -126,11 +126,11 @@ cdef class IndexableSkiplist: chain[level] = node - if value != ( ( ( chain[0]).next)[0]).value: + if value != (((chain[0]).next)[0]).value: raise KeyError('Not Found') # remove one link at each level - d = len(( ( ( chain[0]).next)[0]).next) + d = len((((chain[0]).next)[0]).next) for level in range(d): prevnode = chain[level] diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index d852711d3b707..f5980998f6db4 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -8,22 +8,11 @@ from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t, cnp.import_array() -from distutils.version import LooseVersion - -# numpy versioning -_np_version = np.version.short_version -_np_version_under1p10 = LooseVersion(_np_version) < LooseVersion('1.10') -_np_version_under1p11 = LooseVersion(_np_version) < LooseVersion('1.11') - - # ----------------------------------------------------------------------------- # Preamble stuff -cdef float64_t NaN = np.NaN -cdef float64_t INF = np.inf - -cdef inline int int_max(int a, int b): return a if a >= b else b -cdef inline int int_min(int a, int b): return a if a <= b else b +cdef float64_t NaN = np.NaN +cdef float64_t INF = np.inf # ----------------------------------------------------------------------------- @@ -68,6 +57,10 @@ cdef class IntIndex(SparseIndex): output += 
'Indices: %s\n' % repr(self.indices) return output + @property + def nbytes(self): + return self.indices.nbytes + def check_integrity(self): """ Checks the following: @@ -217,7 +210,7 @@ cdef class IntIndex(SparseIndex): n = len(indexer) results = np.empty(n, dtype=np.int32) - results.fill(-1) + results[:] = -1 if self.npoints == 0: return results @@ -246,9 +239,9 @@ cdef class IntIndex(SparseIndex): sinds = self.indices result = np.empty(other.npoints, dtype=np.float64) - result.fill(fill_value) + result[:] = fill_value - for 0 <= i < other.npoints: + for i in range(other.npoints): while oinds[i] > sinds[j] and j < self.npoints: j += 1 @@ -271,6 +264,7 @@ cdef class IntIndex(SparseIndex): ndarray[int32_t, ndim=1] indices): pass + cpdef get_blocks(ndarray[int32_t, ndim=1] indices): cdef: Py_ssize_t init_len, i, npoints, result_indexer = 0 @@ -311,6 +305,7 @@ cpdef get_blocks(ndarray[int32_t, ndim=1] indices): lens = lens[:result_indexer] return locs, lens + # ----------------------------------------------------------------------------- # BlockIndex @@ -336,8 +331,8 @@ cdef class BlockIndex(SparseIndex): self.blengths = np.ascontiguousarray(blengths, dtype=np.int32) # in case we need - self.locbuf = self.blocs.data - self.lenbuf = self.blengths.data + self.locbuf = self.blocs.data + self.lenbuf = self.blengths.data self.length = length self.nblocks = np.int32(len(self.blocs)) @@ -359,6 +354,10 @@ cdef class BlockIndex(SparseIndex): return output + @property + def nbytes(self): + return self.blocs.nbytes + self.blengths.nbytes + @property def ngaps(self): return self.length - self.npoints @@ -572,7 +571,7 @@ cdef class BlockIndex(SparseIndex): n = len(indexer) results = np.empty(n, dtype=np.int32) - results.fill(-1) + results[:] = -1 if self.npoints == 0: return results @@ -662,11 +661,6 @@ cdef class BlockMerge(object): self.xi = yi self.yi = xi -cdef class BlockIntersection(BlockMerge): - """ - not done yet - """ - pass cdef class BlockUnion(BlockMerge): """ @@ -793,70 +787,15 @@ cdef class BlockUnion(BlockMerge): include "sparse_op_helper.pxi" -# ----------------------------------------------------------------------------- -# Indexing operations - -def get_reindexer(ndarray[object, ndim=1] values, dict index_map): - cdef object idx - cdef Py_ssize_t i - cdef Py_ssize_t new_length = len(values) - cdef ndarray[int32_t, ndim=1] indexer - - indexer = np.empty(new_length, dtype=np.int32) - - for i in range(new_length): - idx = values[i] - if idx in index_map: - indexer[i] = index_map[idx] - else: - indexer[i] = -1 - - return indexer - -# def reindex_block(ndarray[float64_t, ndim=1] values, -# BlockIndex sparse_index, -# ndarray[int32_t, ndim=1] indexer): -# cdef: -# Py_ssize_t i, length -# ndarray[float64_t, ndim=1] out - -# out = np.empty(length, dtype=np.float64) - -# for i in range(length): -# if indexer[i] == -1: -# pass - - -# cdef class SparseCruncher(object): -# """ -# Class to acquire float pointer for convenient operations on sparse data -# structures -# """ -# cdef: -# SparseIndex index -# float64_t* buf - -# def __init__(self, ndarray[float64_t, ndim=1, mode='c'] values, -# SparseIndex index): - -# self.index = index -# self.buf = values.data - - -def reindex_integer(ndarray[float64_t, ndim=1] values, - IntIndex sparse_index, - ndarray[int32_t, ndim=1] indexer): - pass - - # ----------------------------------------------------------------------------- # SparseArray mask create operations def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value): - cdef object 
value - cdef Py_ssize_t i - cdef Py_ssize_t new_length = len(arr) - cdef ndarray[int8_t, ndim=1] mask + cdef: + object value + Py_ssize_t i + Py_ssize_t new_length = len(arr) + ndarray[int8_t, ndim=1] mask mask = np.ones(new_length, dtype=np.int8) diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index 2843a3cf7dd28..c6621ab5977ca 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -4,22 +4,16 @@ Template for each `dtype` helper function for sparse ops WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Sparse op -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- -{{py: - -# dtype, float_group -dtypes = [('float64', True), ('int64', False)] - -}} +ctypedef fused sparse_t: + float64_t + int64_t -{{for dtype, float_group in dtypes}} -{{if float_group}} - -cdef inline {{dtype}}_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): +cdef inline float64_t __div__(sparse_t a, sparse_t b): if b == 0: if a > 0: return INF @@ -30,63 +24,34 @@ cdef inline {{dtype}}_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): else: return float(a) / b -cdef inline {{dtype}}_t __truediv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): - return __div_{{dtype}}(a, b) - -cdef inline {{dtype}}_t __floordiv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): - if b == 0: - # numpy >= 1.11 returns NaN - # for a // 0, rather than +-inf - if _np_version_under1p11: - if a > 0: - return INF - elif a < 0: - return -INF - return NaN - else: - return a // b -cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): - if b == 0: - return NaN - else: - return a % b +cdef inline float64_t __truediv__(sparse_t a, sparse_t b): + return __div__(a, b) -{{else}} -cdef inline float64_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): +cdef inline sparse_t __mod__(sparse_t a, sparse_t b): if b == 0: - if a > 0: - return INF - elif a < 0: - return -INF - else: + if sparse_t is float64_t: return NaN + else: + return 0 else: - return float(a) / b + return a % b -cdef inline float64_t __truediv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): - return __div_{{dtype}}(a, b) -cdef inline {{dtype}}_t __floordiv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): +cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b): if b == 0: - return 0 + if sparse_t is float64_t: + return NaN + else: + return 0 else: return a // b -cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): - if b == 0: - return 0 - else: - return a % b - -{{endif}} - -{{endfor}} -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # sparse array op -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -106,10 +71,10 @@ def get_op(tup): ops_dict = {'add': '{0} + {1}', 'sub': '{0} - {1}', 'mul': '{0} * {1}', - 'div': '__div_{2}({0}, {1})', - 'mod': '__mod_{2}({0}, {1})', - 'truediv': '__truediv_{2}({0}, {1})', - 'floordiv': '__floordiv_{2}({0}, {1})', + 'div': '__div__({0}, {1})', + 'mod': '__mod__({0}, {1})', + 'truediv': '__truediv__({0}, {1})', + 'floordiv': '__floordiv__({0}, {1})', 'pow': '{0} ** {1}', 'eq': '{0} == {1}', 'ne': '{0} != {1}', diff 
--git a/pandas/_libs/src/headers/cmath b/pandas/_libs/src/headers/cmath index 2bccf9bb13d77..632e1fc2390d0 100644 --- a/pandas/_libs/src/headers/cmath +++ b/pandas/_libs/src/headers/cmath @@ -1,16 +1,36 @@ #ifndef _PANDAS_MATH_H_ #define _PANDAS_MATH_H_ +// MSVC 2017 has a bug where `x == x` can be true for NaNs. +// MSC_VER from https://stackoverflow.com/a/70630/1889400 +// Place upper bound on this check once a fixed MSVC is released. +#if defined(_MSC_VER) && (_MSC_VER < 1800) +#include // In older versions of Visual Studio there wasn't a std::signbit defined // This defines it using _copysign -#if defined(_MSC_VER) && (_MSC_VER < 1800) +namespace std { + __inline int isnan(double x) { return _isnan(x); } + __inline int signbit(double num) { return _copysign(1.0, num) < 0; } + __inline int notnan(double x) { return !isnan(x); } +} +#elif defined(_MSC_VER) && (_MSC_VER >= 1900) +#include +namespace std { + __inline int isnan(double x) { return _isnan(x); } + __inline int notnan(double x) { return !isnan(x); } +} +#elif defined(_MSC_VER) #include namespace std { __inline int isnan(double x) { return _isnan(x); } - __inline int signbit(double num) { return _copysign(1.0, num) < 0; } + __inline int notnan(double x) { return x == x; } } #else #include -#endif +namespace std { + __inline int notnan(double x) { return x == x; } +} + +#endif #endif diff --git a/pandas/_libs/src/headers/portable.h b/pandas/_libs/src/headers/portable.h index b9868276ef6e6..9ac4ebc306baa 100644 --- a/pandas/_libs/src/headers/portable.h +++ b/pandas/_libs/src/headers/portable.h @@ -5,4 +5,10 @@ #define strcasecmp( s1, s2 ) _stricmp( s1, s2 ) #endif +// GH-23516 - works around locale perf issues +// from MUSL libc, MIT Licensed - see LICENSES +#define isdigit_ascii(c) ((unsigned)c - '0' < 10) +#define isspace_ascii(c) (c == ' ' || (unsigned)c-'\t' < 5) +#define toupper_ascii(c) (((unsigned)c-'a' < 26) ? (c & 0x5f) : c) + #endif diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 4f9f825b15ffe..b71131bee7008 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -138,11 +138,11 @@ int floatify(PyObject *str, double *result, int *maybe_int) { // PANDAS_INLINE void lowercase(char *p) { - for (; *p; ++p) *p = tolower(*p); + for (; *p; ++p) *p = tolower_ascii(*p); } PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper(*p); + for (; *p; ++p) *p = toupper_ascii(*p); } static double xstrtod(const char *str, char **endptr, char decimal, char sci, @@ -177,7 +177,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, num_decimals = 0; // Process string of digits - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -188,7 +188,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, *maybe_int = 0; p++; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. 
+ (*p - '0'); p++; num_digits++; @@ -207,7 +207,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { *maybe_int = 0; // Handle optional sign @@ -222,7 +222,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -263,7 +263,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, if (skip_trailing) { // Skip trailing whitespace - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2fce241027d56..3a4058f37efc7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -23,6 +23,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include +#include "../headers/portable.h" + static void *safe_realloc(void *buffer, size_t size) { void *result; // OSX is weird. @@ -197,6 +199,7 @@ int parser_init(parser_t *self) { sz = sz ? sz : 1; self->words = (char **)malloc(sz * sizeof(char *)); self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t)); + self->max_words_cap = sz; self->words_cap = sz; self->words_len = 0; @@ -247,7 +250,7 @@ void parser_del(parser_t *self) { } static int make_stream_space(parser_t *self, size_t nbytes) { - int64_t i, cap; + int64_t i, cap, length; int status; void *orig_ptr, *newptr; @@ -287,8 +290,23 @@ static int make_stream_space(parser_t *self, size_t nbytes) { */ cap = self->words_cap; + + /** + * If we are reading in chunks, we need to be aware of the maximum number + * of words we have seen in previous chunks (self->max_words_cap), so + * that way, we can properly allocate when reading subsequent ones. + * + * Otherwise, we risk a buffer overflow if we mistakenly under-allocate + * just because a recent chunk did not have as many words. + */ + if (self->words_len + nbytes < self->max_words_cap) { + length = self->max_words_cap - nbytes; + } else { + length = self->words_len; + } + self->words = - (char **)grow_buffer((void *)self->words, self->words_len, + (char **)grow_buffer((void *)self->words, length, (int64_t*)&self->words_cap, nbytes, sizeof(char *), &status); TRACE( @@ -1241,6 +1259,19 @@ int parser_trim_buffers(parser_t *self) { int64_t i; + /** + * Before we free up space and trim, we should + * save how many words we saw when parsing, if + * it exceeds the maximum number we saw before. + * + * This is important for when we read in chunks, + * so that we can inform subsequent chunk parsing + * as to how many words we could possibly see. + */ + if (self->words_cap > self->max_words_cap) { + self->max_words_cap = self->words_cap; + } + /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { @@ -1382,7 +1413,7 @@ int tokenize_all_rows(parser_t *self) { } PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper(*p); + for (; *p; ++p) *p = toupper_ascii(*p); } int PANDAS_INLINE to_longlong(char *item, long long *p_value) { @@ -1395,7 +1426,7 @@ int PANDAS_INLINE to_longlong(char *item, long long *p_value) { *p_value = strtoll(item, &p_end, 10); // Allow trailing spaces. 
- while (isspace(*p_end)) ++p_end; + while (isspace_ascii(*p_end)) ++p_end; return (errno == 0) && (!*p_end); } @@ -1512,7 +1543,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, errno = 0; // Skip leading whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; // Handle optional sign. negative = 0; @@ -1529,7 +1560,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, num_decimals = 0; // Process string of digits. - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1541,7 +1572,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (*p == decimal) { p++; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1560,7 +1591,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string. - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { // Handle optional sign. negative = 0; switch (*++p) { @@ -1573,7 +1604,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits. num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1614,7 +1645,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (skip_trailing) { // Skip trailing whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; @@ -1668,7 +1699,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, errno = 0; // Skip leading whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; // Handle optional sign. negative = 0; @@ -1685,7 +1716,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, num_decimals = 0; // Process string of digits. - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { if (num_digits < max_digits) { number = number * 10. + (*p - '0'); num_digits++; @@ -1701,7 +1732,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (*p == decimal) { p++; - while (num_digits < max_digits && isdigit(*p)) { + while (num_digits < max_digits && isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1709,7 +1740,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } if (num_digits >= max_digits) // Consume extra decimal digits. - while (isdigit(*p)) ++p; + while (isdigit_ascii(*p)) ++p; exponent -= num_decimals; } @@ -1723,7 +1754,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string. - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { // Handle optional sign negative = 0; switch (*++p) { @@ -1736,7 +1767,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits. num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1769,7 +1800,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (skip_trailing) { // Skip trailing whitespace. 
- while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; @@ -1804,7 +1835,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int d; // Skip leading spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1817,7 +1848,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } // Check that there is a first digit. - if (!isdigit(*p)) { + if (!isdigit_ascii(*p)) { // Error... *error = ERROR_NO_DIGITS; return 0; @@ -1836,7 +1867,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number > pre_min) || @@ -1849,7 +1880,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { number = number * 10 - (d - '0'); @@ -1873,7 +1904,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number < pre_max) || @@ -1887,7 +1918,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); @@ -1902,7 +1933,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } // Skip trailing spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1925,7 +1956,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, int d; // Skip leading spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1939,7 +1970,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } // Check that there is a first digit. - if (!isdigit(*p)) { + if (!isdigit_ascii(*p)) { // Error... *error = ERROR_NO_DIGITS; return 0; @@ -1955,7 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number < pre_max) || @@ -1969,7 +2000,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); @@ -1983,7 +2014,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } // Skip trailing spaces. 
- while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 9fc3593aaaf5b..c32c061c7fa89 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -142,6 +142,7 @@ typedef struct parser_t { int64_t *word_starts; // where we are in the stream int64_t words_len; int64_t words_cap; + int64_t max_words_cap; // maximum word cap encountered char *pword_start; // pointer to stream start of current field int64_t word_start; // position start of current field diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9012ebefe0975..efabc5ad0b1ba 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import cython from cython import Py_ssize_t from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, @@ -32,13 +33,15 @@ from tslibs.parsing import parse_datetime_string from tslibs.timedeltas cimport cast_from_unit from tslibs.timezones cimport is_utc, is_tzlocal, get_dst_info +from tslibs.timezones import UTC from tslibs.conversion cimport (tz_convert_single, _TSObject, convert_datetime_to_tsobject, get_datetime64_nanos, tz_convert_utc_to_tzlocal) -from tslibs.nattype import NaT, nat_strings, iNaT -from tslibs.nattype cimport checknull_with_nat, NPY_NAT +# many modules still look for NaT and iNaT here despite them not being needed +from tslibs.nattype import nat_strings, iNaT # noqa:F821 +from tslibs.nattype cimport checknull_with_nat, NPY_NAT, c_NaT as NaT from tslibs.offsets cimport to_offset @@ -71,7 +74,10 @@ cdef inline object create_time_from_ts( return time(dts.hour, dts.min, dts.sec, dts.us, tz) -def ints_to_pydatetime(int64_t[:] arr, tz=None, freq=None, box="datetime"): +@cython.wraparound(False) +@cython.boundscheck(False) +def ints_to_pydatetime(int64_t[:] arr, object tz=None, object freq=None, + str box="datetime"): """ Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp @@ -99,8 +105,9 @@ def ints_to_pydatetime(int64_t[:] arr, tz=None, freq=None, box="datetime"): int64_t[:] deltas Py_ssize_t pos npy_datetimestruct dts - object dt - int64_t value, delta + object dt, new_tz + str typ + int64_t value, delta, local_value ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, object, object) @@ -207,12 +214,14 @@ def _test_parse_iso8601(object ts): check_dts_bounds(&obj.dts) if out_local == 1: obj.tzinfo = pytz.FixedOffset(out_tzoffset) - obj.value = tz_convert_single(obj.value, obj.tzinfo, 'UTC') + obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) return Timestamp(obj.value, tz=obj.tzinfo) else: return Timestamp(obj.value) +@cython.wraparound(False) +@cython.boundscheck(False) def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object format=None, object na_rep=None): """ @@ -296,7 +305,8 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): +def array_with_unit_to_datetime(ndarray values, object unit, + str errors='coerce'): """ convert the ndarray according to the unit if errors: @@ -335,7 +345,7 @@ def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): # then need to iterate try: iresult = values.astype('i8', casting='same_kind', copy=False) - mask = iresult == iNaT + mask = iresult == NPY_NAT iresult[mask] = 0 fvalues = iresult.astype('f8') * m need_to_iterate = False @@ -351,7 
+361,7 @@ def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): "'{unit}'".format(unit=unit)) result = (iresult * m).astype('M8[ns]') iresult = result.view('i8') - iresult[mask] = iNaT + iresult[mask] = NPY_NAT return result result = np.empty(n, dtype='M8[ns]') @@ -449,10 +459,11 @@ def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): return oresult -cpdef array_to_datetime(ndarray[object] values, errors='raise', - dayfirst=False, yearfirst=False, - format=None, utc=None, - require_iso8601=False): +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef array_to_datetime(ndarray[object] values, str errors='raise', + bint dayfirst=False, bint yearfirst=False, + object utc=None, bint require_iso8601=False): """ Converts a 1D array of date-like values to a numpy array of either: 1) datetime64[ns] data @@ -476,8 +487,6 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', dayfirst parsing behavior when encountering datetime strings yearfirst : bool, default False yearfirst parsing behavior when encountering datetime strings - format : str, default None - format of the string to parse utc : bool, default None indicator whether the dates should be UTC require_iso8601 : bool, default False @@ -501,259 +510,259 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', bint is_raise = errors=='raise' bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' + bint is_same_offsets _TSObject _ts + int64_t value int out_local=0, out_tzoffset=0 - float offset_seconds + float offset_seconds, tz_offset set out_tzoffset_vals = set() # specify error conditions assert is_raise or is_ignore or is_coerce + result = np.empty(n, dtype='M8[ns]') + iresult = result.view('i8') + try: - result = np.empty(n, dtype='M8[ns]') - iresult = result.view('i8') for i in range(n): val = values[i] - if checknull_with_nat(val): - iresult[i] = NPY_NAT + try: + if checknull_with_nat(val): + iresult[i] = NPY_NAT - elif PyDateTime_Check(val): - seen_datetime = 1 - if val.tzinfo is not None: - if utc_convert: - try: + elif PyDateTime_Check(val): + seen_datetime = 1 + if val.tzinfo is not None: + if utc_convert: _ts = convert_datetime_to_tsobject(val, None) iresult[i] = _ts.value - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise + else: + raise ValueError('Tz-aware datetime.datetime ' + 'cannot be converted to ' + 'datetime64 unless utc=True') else: - raise ValueError('Tz-aware datetime.datetime cannot ' - 'be converted to datetime64 unless ' - 'utc=True') - else: - iresult[i] = pydatetime_to_dt64(val, &dts) - if not PyDateTime_CheckExact(val): - # i.e. a Timestamp object - iresult[i] += val.nanosecond - try: + iresult[i] = pydatetime_to_dt64(val, &dts) + if not PyDateTime_CheckExact(val): + # i.e. 
a Timestamp object + iresult[i] += val.nanosecond check_dts_bounds(&dts) - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - elif PyDate_Check(val): - seen_datetime = 1 - iresult[i] = pydate_to_dt64(val, &dts) - try: + elif PyDate_Check(val): + seen_datetime = 1 + iresult[i] = pydate_to_dt64(val, &dts) check_dts_bounds(&dts) - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - elif is_datetime64_object(val): - seen_datetime = 1 - if get_datetime64_value(val) == NPY_NAT: - iresult[i] = NPY_NAT - else: - try: - iresult[i] = get_datetime64_nanos(val) - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise + elif is_datetime64_object(val): + seen_datetime = 1 + iresult[i] = get_datetime64_nanos(val) - elif is_integer_object(val) or is_float_object(val): - # these must be ns unit by-definition - seen_integer = 1 + elif is_integer_object(val) or is_float_object(val): + # these must be ns unit by-definition + seen_integer = 1 - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT - elif is_raise or is_ignore: - iresult[i] = val - else: - # coerce - # we now need to parse this as if unit='ns' - # we can ONLY accept integers at this point - # if we have previously (or in future accept - # datetimes/strings, then we must coerce) - try: - iresult[i] = cast_from_unit(val, 'ns') - except: + if val != val or val == NPY_NAT: iresult[i] = NPY_NAT + elif is_raise or is_ignore: + iresult[i] = val + else: + # coerce + # we now need to parse this as if unit='ns' + # we can ONLY accept integers at this point + # if we have previously (or in future accept + # datetimes/strings, then we must coerce) + try: + iresult[i] = cast_from_unit(val, 'ns') + except: + iresult[i] = NPY_NAT - elif is_string_object(val): - # string - seen_string = 1 - - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT - continue - if isinstance(val, unicode) and PY2: - val = val.encode('utf-8') + elif is_string_object(val): + # string + seen_string = 1 - try: - _string_to_dts(val, &dts, &out_local, &out_tzoffset) - except ValueError: - # A ValueError at this point is a _parsing_ error - # specifically _not_ OutOfBoundsDatetime - if _parse_today_now(val, &iresult[i]): + if len(val) == 0 or val in nat_strings: + iresult[i] = NPY_NAT continue - elif require_iso8601: - # if requiring iso8601 strings, skip trying - # other formats - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise ValueError("time data {val} doesn't match " - "format specified" - .format(val=val)) - return values, tz_out + if isinstance(val, unicode) and PY2: + val = val.encode('utf-8') try: - py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst) - except Exception: - if is_coerce: - iresult[i] = NPY_NAT + _string_to_dts(val, &dts, &out_local, &out_tzoffset) + except ValueError: + # A ValueError at this point is a _parsing_ error + # specifically _not_ OutOfBoundsDatetime + if _parse_today_now(val, &iresult[i]): continue - raise TypeError("invalid string coercion to datetime") - - # If the dateutil parser returned tzinfo, capture it - # to check if all arguments have the same tzinfo - tz = py_dt.utcoffset() - if tz is not None: - seen_datetime_offset = 1 - # dateutil timezone objects cannot be hashed, so store - # the UTC offsets in seconds instead - out_tzoffset_vals.add(tz.total_seconds()) - else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - 
out_tzoffset_vals.add('naive') - try: + elif require_iso8601: + # if requiring iso8601 strings, skip trying + # other formats + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError("time data {val} doesn't " + "match format specified" + .format(val=val)) + return values, tz_out + + try: + py_dt = parse_datetime_string(val, + dayfirst=dayfirst, + yearfirst=yearfirst) + except Exception: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise TypeError("invalid string coercion to " + "datetime") + + # If the dateutil parser returned tzinfo, capture it + # to check if all arguments have the same tzinfo + tz = py_dt.utcoffset() + if tz is not None: + seen_datetime_offset = 1 + # dateutil timezone objects cannot be hashed, so + # store the UTC offsets in seconds instead + out_tzoffset_vals.add(tz.total_seconds()) + else: + # Add a marker for naive string, to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add('naive') + _ts = convert_datetime_to_tsobject(py_dt, None) iresult[i] = _ts.value - except OutOfBoundsDatetime: + except: + # TODO: What exception are we concerned with here? if is_coerce: iresult[i] = NPY_NAT continue raise - except: - # TODO: What exception are we concerned with here? + else: + # No error raised by string_to_dts, pick back up + # where we left off + value = dtstruct_to_dt64(&dts) + if out_local == 1: + seen_datetime_offset = 1 + # Store the out_tzoffset in seconds + # since we store the total_seconds of + # dateutil.tz.tzoffset objects + out_tzoffset_vals.add(out_tzoffset * 60.) + tz = pytz.FixedOffset(out_tzoffset) + value = tz_convert_single(value, tz, UTC) + else: + # Add a marker for naive string, to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add('naive') + iresult[i] = value + check_dts_bounds(&dts) + + else: if is_coerce: iresult[i] = NPY_NAT - continue - raise - else: - # No error raised by string_to_dts, pick back up - # where we left off - value = dtstruct_to_dt64(&dts) - if out_local == 1: - seen_datetime_offset = 1 - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects - out_tzoffset_vals.add(out_tzoffset * 60.) 
- tz = pytz.FixedOffset(out_tzoffset) - value = tz_convert_single(value, tz, 'UTC') else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add('naive') - iresult[i] = value - try: - check_dts_bounds(&dts) - except OutOfBoundsDatetime: - # GH#19382 for just-barely-OutOfBounds falling back to - # dateutil parser will return incorrect result because - # it will ignore nanoseconds - if is_coerce: - iresult[i] = NPY_NAT - continue - elif require_iso8601: - if is_raise: - raise ValueError("time data {val} doesn't " - "match format specified" - .format(val=val)) - return values, tz_out - raise + raise TypeError("{typ} is not convertible to datetime" + .format(typ=type(val))) - else: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT - else: - raise TypeError("{typ} is not convertible to datetime" - .format(typ=type(val))) - - if seen_datetime and seen_integer: - # we have mixed datetimes & integers - - if is_coerce: - # coerce all of the integers/floats to NaT, preserve - # the datetimes and other convertibles - for i in range(n): - val = values[i] - if is_integer_object(val) or is_float_object(val): - result[i] = NPY_NAT - elif is_raise: - raise ValueError( - "mixed datetimes and integers in passed array") - else: - raise TypeError - - if seen_datetime_offset and not utc_convert: - # GH 17697 - # 1) If all the offsets are equal, return one offset for - # the parsed dates to (maybe) pass to DatetimeIndex - # 2) If the offsets are different, then force the parsing down the - # object path where an array of datetimes - # (with individual dateutil.tzoffsets) are returned - is_same_offsets = len(out_tzoffset_vals) == 1 - if not is_same_offsets: - return array_to_datetime_object(values, is_raise, - dayfirst, yearfirst) - else: - tz_offset = out_tzoffset_vals.pop() - tz_out = pytz.FixedOffset(tz_offset / 60.) 
- return result, tz_out + continue + elif require_iso8601 and is_string_object(val): + # GH#19382 for just-barely-OutOfBounds falling back to + # dateutil parser will return incorrect result because + # it will ignore nanoseconds + if is_raise: + raise ValueError("time data {val} doesn't " + "match format specified" + .format(val=val)) + assert is_ignore + return values, tz_out + raise + except OutOfBoundsDatetime: if is_raise: raise - oresult = np.empty(n, dtype=object) - for i in range(n): - val = values[i] + return ignore_errors_out_of_bounds_fallback(values), tz_out - # set as nan except if its a NaT - if checknull_with_nat(val): - if isinstance(val, float): - oresult[i] = np.nan - else: - oresult[i] = NaT - elif is_datetime64_object(val): - if get_datetime64_value(val) == NPY_NAT: - oresult[i] = NaT - else: - oresult[i] = val.item() - else: - oresult[i] = val - return oresult, tz_out except TypeError: return array_to_datetime_object(values, is_raise, dayfirst, yearfirst) + if seen_datetime and seen_integer: + # we have mixed datetimes & integers + + if is_coerce: + # coerce all of the integers/floats to NaT, preserve + # the datetimes and other convertibles + for i in range(n): + val = values[i] + if is_integer_object(val) or is_float_object(val): + result[i] = NPY_NAT + elif is_raise: + raise ValueError("mixed datetimes and integers in passed array") + else: + return array_to_datetime_object(values, is_raise, + dayfirst, yearfirst) + + if seen_datetime_offset and not utc_convert: + # GH#17697 + # 1) If all the offsets are equal, return one offset for + # the parsed dates to (maybe) pass to DatetimeIndex + # 2) If the offsets are different, then force the parsing down the + # object path where an array of datetimes + # (with individual dateutil.tzoffsets) are returned + is_same_offsets = len(out_tzoffset_vals) == 1 + if not is_same_offsets: + return array_to_datetime_object(values, is_raise, + dayfirst, yearfirst) + else: + tz_offset = out_tzoffset_vals.pop() + tz_out = pytz.FixedOffset(tz_offset / 60.) 
+ return result, tz_out + + +cdef inline ignore_errors_out_of_bounds_fallback(ndarray[object] values): + """ + Fallback for array_to_datetime if an OutOfBoundsDatetime is raised + and errors == "ignore" + + Parameters + ---------- + values : ndarray[object] + + Returns + ------- + ndarray[object] + """ + cdef: + Py_ssize_t i, n = len(values) + object val + + oresult = np.empty(n, dtype=object) + + for i in range(n): + val = values[i] + + # set as nan except if its a NaT + if checknull_with_nat(val): + if isinstance(val, float): + oresult[i] = np.nan + else: + oresult[i] = NaT + elif is_datetime64_object(val): + if get_datetime64_value(val) == NPY_NAT: + oresult[i] = NaT + else: + oresult[i] = val.item() + else: + oresult[i] = val + return oresult + +@cython.wraparound(False) +@cython.boundscheck(False) cdef array_to_datetime_object(ndarray[object] values, bint is_raise, - dayfirst=False, yearfirst=False): + bint dayfirst=False, bint yearfirst=False): """ Fall back function for array_to_datetime diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 7d58b43e5d460..587213049af85 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -49,12 +49,15 @@ DAYS_FULL = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', int_to_weekday = {num: name for num, name in enumerate(DAYS)} weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday} +DAY_SECONDS = 86400 +HOUR_SECONDS = 3600 + # ---------------------------------------------------------------------- @cython.wraparound(False) @cython.boundscheck(False) -cpdef inline int32_t get_days_in_month(int year, Py_ssize_t month) nogil: +cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil: """Return the number of days in the given month of the given year. 
Parameters diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index d7eef546befbd..e6e7884f05b20 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- - import cython from cython import Py_ssize_t import numpy as np cimport numpy as cnp -from numpy cimport int64_t, int32_t, ndarray +from numpy cimport uint8_t, int64_t, int32_t, ndarray cnp.import_array() import pytz +from dateutil.tz import tzutc # stdlib datetime imports from datetime import time as datetime_time @@ -17,6 +17,8 @@ from cpython.datetime cimport (datetime, tzinfo, PyDateTime_CheckExact, PyDateTime_IMPORT) PyDateTime_IMPORT +from ccalendar import DAY_SECONDS, HOUR_SECONDS + from np_datetime cimport (check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct, _string_to_dts, @@ -28,31 +30,28 @@ from np_datetime import OutOfBoundsDatetime from util cimport (is_string_object, is_datetime64_object, - is_integer_object, is_float_object, is_array) + is_integer_object, is_float_object) from timedeltas cimport cast_from_unit from timezones cimport (is_utc, is_tzlocal, is_fixed_offset, - treat_tz_as_dateutil, treat_tz_as_pytz, get_utcoffset, get_dst_info, get_timezone, maybe_get_tz, tz_compare) +from timezones import UTC from parsing import parse_datetime_string -from nattype import nat_strings, NaT -from nattype cimport NPY_NAT, checknull_with_nat +from nattype import nat_strings +from nattype cimport NPY_NAT, checknull_with_nat, c_NaT as NaT # ---------------------------------------------------------------------- # Constants -cdef int64_t DAY_NS = 86400000000000LL NS_DTYPE = np.dtype('M8[ns]') TD_DTYPE = np.dtype('m8[ns]') -UTC = pytz.UTC # ---------------------------------------------------------------------- # Misc Helpers -# TODO: How to declare np.datetime64 as the input type? cdef inline int64_t get_datetime64_nanos(object val) except? -1: """ Extract the value and unit from a np.datetime64 object, then convert the @@ -63,8 +62,11 @@ cdef inline int64_t get_datetime64_nanos(object val) except? -1: NPY_DATETIMEUNIT unit npy_datetime ival - unit = get_datetime64_unit(val) ival = get_datetime64_value(val) + if ival == NPY_NAT: + return NPY_NAT + + unit = get_datetime64_unit(val) if unit != NPY_FR_ns: pandas_datetime_to_datetimestruct(ival, unit, &dts) @@ -74,7 +76,9 @@ cdef inline int64_t get_datetime64_nanos(object val) except? 
-1: return ival -def ensure_datetime64ns(ndarray arr, copy=True): +@cython.boundscheck(False) +@cython.wraparound(False) +def ensure_datetime64ns(arr: ndarray, copy: bool=True): """ Ensure a np.datetime64 array has dtype specifically 'datetime64[ns]' @@ -94,14 +98,17 @@ def ensure_datetime64ns(ndarray arr, copy=True): NPY_DATETIMEUNIT unit npy_datetimestruct dts - shape = ( arr).shape + shape = (arr).shape ivalues = arr.view(np.int64).ravel() - result = np.empty(shape, dtype='M8[ns]') + result = np.empty(shape, dtype=NS_DTYPE) iresult = result.ravel().view(np.int64) if len(iresult) == 0: + result = arr.view(NS_DTYPE) + if copy: + result = result.copy() return result unit = get_datetime64_unit(arr.flat[0]) @@ -121,7 +128,7 @@ def ensure_datetime64ns(ndarray arr, copy=True): return result -def ensure_timedelta64ns(ndarray arr, copy=True): +def ensure_timedelta64ns(arr: ndarray, copy: bool=True): """ Ensure a np.timedelta64 array has dtype specifically 'timedelta64[ns]' @@ -136,20 +143,23 @@ def ensure_timedelta64ns(ndarray arr, copy=True): """ return arr.astype(TD_DTYPE, copy=copy) + # TODO: check for overflows when going from a lower-resolution to nanos -def datetime_to_datetime64(object[:] values): +@cython.boundscheck(False) +@cython.wraparound(False) +def datetime_to_datetime64(values: object[:]): """ Convert ndarray of datetime-like objects to int64 array representing nanosecond timestamps. Parameters ---------- - values : ndarray + values : ndarray[object] Returns ------- - result : ndarray with dtype int64 + result : ndarray[int64_t] inferred_tz : tzinfo or None """ cdef: @@ -225,6 +235,7 @@ cdef class _TSObject: @property def value(self): + # This is needed in order for `value` to be accessible in lib.pyx return self.value @@ -275,10 +286,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, if ts is None or ts is NaT: obj.value = NPY_NAT elif is_datetime64_object(ts): - if ts.view('i8') == NPY_NAT: - obj.value = NPY_NAT - else: - obj.value = get_datetime64_nanos(ts) + obj.value = get_datetime64_nanos(ts) + if obj.value != NPY_NAT: dt64_to_dtstruct(obj.value, &obj.dts) elif is_integer_object(ts): if ts == NPY_NAT: @@ -357,7 +366,7 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, else: # UTC obj.value = pydatetime_to_dt64(ts, &obj.dts) - obj.tzinfo = pytz.utc + obj.tzinfo = tz else: obj.value = pydatetime_to_dt64(ts, &obj.dts) obj.tzinfo = ts.tzinfo @@ -437,7 +446,7 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, check_dts_bounds(&obj.dts) if out_local == 1: obj.tzinfo = pytz.FixedOffset(out_tzoffset) - obj.value = tz_convert_single(obj.value, obj.tzinfo, 'UTC') + obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) if tz is None: check_dts_bounds(&obj.dts) check_overflows(obj) @@ -458,8 +467,7 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, if tz is not None: # shift for localize_tso ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, - ambiguous='raise', - errors='raise')[0] + ambiguous='raise')[0] except OutOfBoundsDatetime: # GH#19382 for just-barely-OutOfBounds falling back to dateutil @@ -528,6 +536,7 @@ cdef inline void localize_tso(_TSObject obj, tzinfo tz): int64_t[:] deltas int64_t local_val Py_ssize_t pos + str typ assert obj.tzinfo is None @@ -572,8 +581,6 @@ cdef inline datetime _localize_pydatetime(datetime dt, tzinfo tz): identically, i.e. discards nanos from Timestamps. It also assumes that the `tz` input is not None. 
""" - if tz == 'UTC' or tz is UTC: - return UTC.localize(dt) try: # datetime.replace with pytz may be incorrect result return tz.localize(dt) @@ -599,8 +606,8 @@ cpdef inline datetime localize_pydatetime(datetime dt, object tz): elif not PyDateTime_CheckExact(dt): # i.e. is a Timestamp return dt.tz_localize(tz) - elif tz == 'UTC' or tz is UTC: - return UTC.localize(dt) + elif is_utc(tz): + return _localize_pydatetime(dt, tz) try: # datetime.replace with pytz may be incorrect result return tz.localize(dt) @@ -611,6 +618,8 @@ cpdef inline datetime localize_pydatetime(datetime dt, object tz): # ---------------------------------------------------------------------- # Timezone Conversion +@cython.boundscheck(False) +@cython.wraparound(False) cdef inline int64_t[:] _tz_convert_dst(int64_t[:] values, tzinfo tz, bint to_utc=True): """ @@ -636,15 +645,20 @@ cdef inline int64_t[:] _tz_convert_dst(int64_t[:] values, tzinfo tz, int64_t[:] deltas int64_t v - trans, deltas, typ = get_dst_info(tz) - if not to_utc: - # We add `offset` below instead of subtracting it - deltas = -1 * np.array(deltas, dtype='i8') + if not is_tzlocal(tz): + # get_dst_info cannot extract offsets from tzlocal because its + # dependent on a datetime + trans, deltas, _ = get_dst_info(tz) + if not to_utc: + # We add `offset` below instead of subtracting it + deltas = -1 * np.array(deltas, dtype='i8') for i in range(n): v = values[i] if v == NPY_NAT: result[i] = v + elif is_tzlocal(tz): + result[i] = _tz_convert_tzlocal_utc(v, tz, to_utc=to_utc) else: # TODO: Is it more efficient to call searchsorted pointwise or # on `values` outside the loop? We are not consistent about this. @@ -678,12 +692,17 @@ cdef inline int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, """ cdef: npy_datetimestruct dts - int64_t result, delta + int64_t delta datetime dt dt64_to_dtstruct(val, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) + dts.min, dts.sec, dts.us) + # get_utcoffset (tz.utcoffset under the hood) only makes sense if datetime + # is _wall time_, so if val is a UTC timestamp convert to wall time + if not to_utc: + dt = dt.replace(tzinfo=tzutc()) + dt = dt.astimezone(tz) delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 if not to_utc: @@ -729,7 +748,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): int64_t arr[1] # See GH#17734 We should always be converting either from UTC or to UTC - assert (is_utc(tz1) or tz1 == 'UTC') or (is_utc(tz2) or tz2 == 'UTC') + assert is_utc(tz1) or is_utc(tz2) if val == NPY_NAT: return val @@ -737,13 +756,13 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): # Convert to UTC if is_tzlocal(tz1): utc_date = _tz_convert_tzlocal_utc(val, tz1, to_utc=True) - elif get_timezone(tz1) != 'UTC': + elif not is_utc(get_timezone(tz1)): arr[0] = val utc_date = _tz_convert_dst(arr, tz1, to_utc=True)[0] else: utc_date = val - if get_timezone(tz2) == 'UTC': + if is_utc(get_timezone(tz2)): return utc_date elif is_tzlocal(tz2): return _tz_convert_tzlocal_utc(utc_date, tz2, to_utc=False) @@ -757,6 +776,8 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): return _tz_convert_dst(arr, tz2, to_utc=False)[0] +@cython.boundscheck(False) +@cython.wraparound(False) cdef inline int64_t[:] _tz_convert_one_way(int64_t[:] vals, object tz, bint to_utc): """ @@ -777,7 +798,7 @@ cdef inline int64_t[:] _tz_convert_one_way(int64_t[:] vals, object tz, Py_ssize_t i, n = len(vals) int64_t val - if get_timezone(tz) != 'UTC': 
+ if not is_utc(get_timezone(tz)): converted = np.empty(n, dtype=np.int64) if is_tzlocal(tz): for i in range(n): @@ -826,7 +847,7 @@ def tz_convert(int64_t[:] vals, object tz1, object tz2): @cython.boundscheck(False) @cython.wraparound(False) def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, - object errors='raise'): + object nonexistent=None): """ Localize tzinfo-naive i8 to given time zone (using pytz). If there are ambiguities in the values, raise AmbiguousTimeError. @@ -836,30 +857,47 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, vals : ndarray[int64_t] tz : tzinfo or None ambiguous : str, bool, or arraylike - If arraylike, must have the same length as vals - errors : {"raise", "coerce"}, default "raise" + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from 03:00 + DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC + and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter + dictates how ambiguous times should be handled. + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False signifies a + non-DST time (note that this flag is only applicable for ambiguous + times, but the array must have the same length as vals) + - bool if True, treat all vals as DST. If False, treat them as non-DST + - 'NaT' will return NaT where there are ambiguous times + + nonexistent : {None, "NaT", "shift", "raise"} + How to handle non-existent times when converting wall times to UTC + + .. versionadded:: 0.24.0 Returns ------- localized : ndarray[int64_t] """ cdef: - ndarray[int64_t] trans - int64_t[:] deltas, idx_shifted - ndarray ambiguous_array + int64_t[:] deltas, idx_shifted, idx_shifted_left, idx_shifted_right + ndarray[uint8_t, cast=True] ambiguous_array, both_nat, both_eq Py_ssize_t i, idx, pos, ntrans, n = len(vals) + Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right int64_t *tdata - int64_t v, left, right - ndarray[int64_t] result, result_a, result_b, dst_hours + int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins + int64_t HOURS_NS = HOUR_SECONDS * 1000000000 + ndarray[int64_t] trans, result, result_a, result_b, dst_hours, delta + ndarray trans_idx, grp, a_idx, b_idx, one_diff npy_datetimestruct dts bint infer_dst = False, is_dst = False, fill = False - bint is_coerce = errors == 'coerce', is_raise = errors == 'raise' + bint shift = False, fill_nonexist = False + list trans_grp + str stamp # Vectorized version of DstTzInfo.localize - - assert is_coerce or is_raise - - if tz == UTC or tz is None: + if is_utc(tz) or tz is None: return vals result = np.empty(n, dtype=np.int64) @@ -867,7 +905,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, if is_tzlocal(tz): for i in range(n): v = vals[i] - result[i] = _tz_convert_tzlocal_utc(v, tz, to_utc=True) + if v == NPY_NAT: + result[i] = NPY_NAT + else: + result[i] = _tz_convert_tzlocal_utc(v, tz, to_utc=True) return result if is_string_object(ambiguous): @@ -886,45 +927,51 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, if len(ambiguous) != len(vals): raise ValueError("Length of ambiguous bool-array must be " "the same size as vals") - ambiguous_array = np.asarray(ambiguous) + ambiguous_array = np.asarray(ambiguous, dtype=bool) - trans, deltas, typ = get_dst_info(tz) + if nonexistent == 'NaT': + fill_nonexist = 
True + elif nonexistent == 'shift': + shift = True + else: + assert nonexistent in ('raise', None), ("nonexistent must be one of" + " {'NaT', 'raise', 'shift'}") - tdata = cnp.PyArray_DATA(trans) + trans, deltas, _ = get_dst_info(tz) + + tdata = cnp.PyArray_DATA(trans) ntrans = len(trans) + # Determine whether each date lies left of the DST transition (store in + # result_a) or right of the DST transition (store in result_b) result_a = np.empty(n, dtype=np.int64) result_b = np.empty(n, dtype=np.int64) - result_a.fill(NPY_NAT) - result_b.fill(NPY_NAT) - - # left side - idx_shifted = (np.maximum(0, trans.searchsorted( - vals - DAY_NS, side='right') - 1)).astype(np.int64) + result_a[:] = NPY_NAT + result_b[:] = NPY_NAT - for i in range(n): - v = vals[i] - deltas[idx_shifted[i]] - pos = bisect_right_i8(tdata, v, ntrans) - 1 + idx_shifted_left = (np.maximum(0, trans.searchsorted( + vals - DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64) - # timestamp falls to the left side of the DST transition - if v + deltas[pos] == vals[i]: - result_a[i] = v - - # right side - idx_shifted = (np.maximum(0, trans.searchsorted( - vals + DAY_NS, side='right') - 1)).astype(np.int64) + idx_shifted_right = (np.maximum(0, trans.searchsorted( + vals + DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64) for i in range(n): - v = vals[i] - deltas[idx_shifted[i]] - pos = bisect_right_i8(tdata, v, ntrans) - 1 + val = vals[i] + v_left = val - deltas[idx_shifted_left[i]] + pos_left = bisect_right_i8(tdata, v_left, ntrans) - 1 + # timestamp falls to the left side of the DST transition + if v_left + deltas[pos_left] == val: + result_a[i] = v_left + v_right = val - deltas[idx_shifted_right[i]] + pos_right = bisect_right_i8(tdata, v_right, ntrans) - 1 # timestamp falls to the right side of the DST transition - if v + deltas[pos] == vals[i]: - result_b[i] = v + if v_right + deltas[pos_right] == val: + result_b[i] = v_right if infer_dst: dst_hours = np.empty(n, dtype=np.int64) - dst_hours.fill(NPY_NAT) + dst_hours[:] = NPY_NAT # Get the ambiguous hours (given the above, these are the hours # where result_a != result_b and neither of them are NAT) @@ -935,13 +982,13 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, stamp = _render_tstamp(vals[trans_idx]) raise pytz.AmbiguousTimeError( "Cannot infer dst time from %s as there " - "are no repeated times" % stamp) + "are no repeated times".format(stamp)) # Split the array into contiguous chunks (where the difference between # indices is 1). These are effectively dst transitions in different # years which is useful for checking that there is not an ambiguous # transition in an individual year. if trans_idx.size > 0: - one_diff = np.where(np.diff(trans_idx) != 1)[0] +1 + one_diff = np.where(np.diff(trans_idx) != 1)[0] + 1 trans_grp = np.array_split(trans_idx, one_diff) # Iterate through each day, if there are no hours where the @@ -960,7 +1007,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, if switch_idx.size > 1: raise pytz.AmbiguousTimeError( "There are %i dst switches when " - "there should only be 1." 
% switch_idx.size) + "there should only be 1.".format(switch_idx.size)) switch_idx = switch_idx[0] + 1 # Pull the only index and adjust a_idx = grp[:switch_idx] @@ -968,10 +1015,11 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx])) for i in range(n): + val = vals[i] left = result_a[i] right = result_b[i] - if vals[i] == NPY_NAT: - result[i] = vals[i] + if val == NPY_NAT: + result[i] = val elif left != NPY_NAT and right != NPY_NAT: if left == right: result[i] = left @@ -986,26 +1034,40 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, elif fill: result[i] = NPY_NAT else: - stamp = _render_tstamp(vals[i]) + stamp = _render_tstamp(val) raise pytz.AmbiguousTimeError( "Cannot infer dst time from %r, try using the " - "'ambiguous' argument" % stamp) + "'ambiguous' argument".format(stamp)) elif left != NPY_NAT: result[i] = left elif right != NPY_NAT: result[i] = right else: - if is_coerce: + # Handle nonexistent times + if shift: + # Shift the nonexistent time forward to the closest existing + # time + remaining_mins = val % HOURS_NS + new_local = val + (HOURS_NS - remaining_mins) + delta_idx = trans.searchsorted(new_local, side='right') + # Need to subtract 1 from the delta_idx if the UTC offset of + # the target tz is greater than 0 + delta_idx_offset = int(deltas[0] > 0) + delta_idx = delta_idx - delta_idx_offset + result[i] = new_local - deltas[delta_idx] + elif fill_nonexist: result[i] = NPY_NAT else: - stamp = _render_tstamp(vals[i]) + stamp = _render_tstamp(val) raise pytz.NonExistentTimeError(stamp) return result -cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): - cdef Py_ssize_t pivot, left = 0, right = n +cdef inline Py_ssize_t bisect_right_i8(int64_t *data, + int64_t val, Py_ssize_t n): + cdef: + Py_ssize_t pivot, left = 0, right = n assert n >= 1 @@ -1037,7 +1099,7 @@ cdef inline str _render_tstamp(int64_t val): # Normalization -def normalize_date(object dt): +def normalize_date(dt: object) -> datetime: """ Normalize datetime.datetime value to midnight. Returns datetime.date as a datetime.datetime at midnight @@ -1071,11 +1133,11 @@ def normalize_date(object dt): @cython.wraparound(False) @cython.boundscheck(False) -def normalize_i8_timestamps(int64_t[:] stamps, tz=None): +def normalize_i8_timestamps(int64_t[:] stamps, object tz=None): """ - Normalize each of the (nanosecond) timestamps in the given array by - rounding down to the beginning of the day (i.e. midnight). If `tz` - is not None, then this is midnight for this timezone. + Normalize each of the (nanosecond) timezone aware timestamps in the given + array by rounding down to the beginning of the day (i.e. midnight). + This is midnight for timezone, `tz`. 
Parameters ---------- @@ -1087,28 +1149,18 @@ def normalize_i8_timestamps(int64_t[:] stamps, tz=None): result : int64 ndarray of converted of normalized nanosecond timestamps """ cdef: - Py_ssize_t i, n = len(stamps) - npy_datetimestruct dts + Py_ssize_t n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) - if tz is not None: - tz = maybe_get_tz(tz) - result = _normalize_local(stamps, tz) - else: - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i], &dts) - result[i] = _normalized_stamp(&dts) + tz = maybe_get_tz(tz) + result = _normalize_local(stamps, tz) return result.base # .base to access underlying np.ndarray @cython.wraparound(False) @cython.boundscheck(False) -cdef int64_t[:] _normalize_local(int64_t[:] stamps, object tz): +cdef int64_t[:] _normalize_local(int64_t[:] stamps, tzinfo tz): """ Normalize each of the (nanosecond) timestamps in the given array by rounding down to the beginning of the day (i.e. midnight) for the @@ -1117,20 +1169,21 @@ cdef int64_t[:] _normalize_local(int64_t[:] stamps, object tz): Parameters ---------- stamps : int64 ndarray - tz : tzinfo or None + tz : tzinfo Returns ------- result : int64 ndarray of converted of normalized nanosecond timestamps """ cdef: - Py_ssize_t n = len(stamps) + Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) ndarray[int64_t] trans int64_t[:] deltas + str typ Py_ssize_t[:] pos npy_datetimestruct dts - int64_t delta + int64_t delta, local_val if is_utc(tz): with nogil: @@ -1193,7 +1246,9 @@ cdef inline int64_t _normalized_stamp(npy_datetimestruct *dts) nogil: return dtstruct_to_dt64(dts) -def is_date_array_normalized(int64_t[:] stamps, tz=None): +@cython.wraparound(False) +@cython.boundscheck(False) +def is_date_array_normalized(int64_t[:] stamps, object tz=None): """ Check if all of the given (nanosecond) timestamps are normalized to midnight, i.e. hour == minute == second == 0. 
If the optional timezone @@ -1214,6 +1269,7 @@ def is_date_array_normalized(int64_t[:] stamps, tz=None): int64_t[:] deltas npy_datetimestruct dts int64_t local_val, delta + str typ if tz is None or is_utc(tz): for i in range(n): diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 684344ceb9002..72157c2fcb2f3 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -12,7 +12,7 @@ cimport numpy as cnp from numpy cimport ndarray, int64_t, int32_t, int8_t cnp.import_array() -from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL +from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL, DAY_SECONDS from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek, get_week_of_year, get_day_of_year) from np_datetime cimport (npy_datetimestruct, pandas_timedeltastruct, @@ -36,11 +36,14 @@ def get_time_micros(ndarray[int64_t] dtindex): cdef: ndarray[int64_t] micros - micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL + micros = np.mod(dtindex, DAY_SECONDS * 1000000000, dtype=np.int64) + micros //= 1000LL return micros -def build_field_sarray(ndarray[int64_t] dtindex): +@cython.wraparound(False) +@cython.boundscheck(False) +def build_field_sarray(int64_t[:] dtindex): """ Datetime as int64 representation to a structured array of fields """ @@ -112,7 +115,7 @@ def get_date_name_field(int64_t[:] dtindex, object field, object locale=None): dt64_to_dtstruct(dtindex[i], &dts) dow = dayofweek(dts.year, dts.month, dts.day) out[i] = names[dow].capitalize() - return out + elif field == 'month_name': if locale is None: names = np.array(MONTHS_FULL, dtype=np.object_) @@ -126,12 +129,15 @@ def get_date_name_field(int64_t[:] dtindex, object field, object locale=None): dt64_to_dtstruct(dtindex[i], &dts) out[i] = names[dts.month].capitalize() - return out - raise ValueError("Field %s not supported" % field) + else: + raise ValueError("Field {field} not supported".format(field=field)) + + return out @cython.wraparound(False) +@cython.boundscheck(False) def get_start_end_field(int64_t[:] dtindex, object field, object freqstr=None, int month_kw=12): """ @@ -161,8 +167,8 @@ def get_start_end_field(int64_t[:] dtindex, object field, if freqstr: if freqstr == 'C': - raise ValueError( - "Custom business days is not supported by %s" % field) + raise ValueError("Custom business days is not supported by {field}" + .format(field=field)) is_business = freqstr[0] == 'B' # YearBegin(), BYearBegin() use month = starting month of year. 
@@ -194,7 +200,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, if (dom == 1 and dow < 5) or (dom <= 3 and dow == 0): out[i] = 1 - return out.view(bool) + else: for i in range(count): if dtindex[i] == NPY_NAT: @@ -206,7 +212,6 @@ def get_start_end_field(int64_t[:] dtindex, object field, if dom == 1: out[i] = 1 - return out.view(bool) elif field == 'is_month_end': if is_business: @@ -226,7 +231,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, if (ldom == doy and dow < 5) or ( dow == 4 and (ldom - doy <= 2)): out[i] = 1 - return out.view(bool) + else: for i in range(count): if dtindex[i] == NPY_NAT: @@ -242,7 +247,6 @@ def get_start_end_field(int64_t[:] dtindex, object field, if ldom == doy: out[i] = 1 - return out.view(bool) elif field == 'is_quarter_start': if is_business: @@ -258,7 +262,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, if ((dts.month - start_month) % 3 == 0) and ( (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): out[i] = 1 - return out.view(bool) + else: for i in range(count): if dtindex[i] == NPY_NAT: @@ -270,7 +274,6 @@ def get_start_end_field(int64_t[:] dtindex, object field, if ((dts.month - start_month) % 3 == 0) and dom == 1: out[i] = 1 - return out.view(bool) elif field == 'is_quarter_end': if is_business: @@ -291,7 +294,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, (ldom == doy and dow < 5) or ( dow == 4 and (ldom - doy <= 2))): out[i] = 1 - return out.view(bool) + else: for i in range(count): if dtindex[i] == NPY_NAT: @@ -307,7 +310,6 @@ def get_start_end_field(int64_t[:] dtindex, object field, if ((dts.month - end_month) % 3 == 0) and (ldom == doy): out[i] = 1 - return out.view(bool) elif field == 'is_year_start': if is_business: @@ -323,7 +325,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, if (dts.month == start_month) and ( (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): out[i] = 1 - return out.view(bool) + else: for i in range(count): if dtindex[i] == NPY_NAT: @@ -335,7 +337,6 @@ def get_start_end_field(int64_t[:] dtindex, object field, if (dts.month == start_month) and dom == 1: out[i] = 1 - return out.view(bool) elif field == 'is_year_end': if is_business: @@ -356,7 +357,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, (ldom == doy and dow < 5) or ( dow == 4 and (ldom - doy <= 2))): out[i] = 1 - return out.view(bool) + else: for i in range(count): if dtindex[i] == NPY_NAT: @@ -372,9 +373,11 @@ def get_start_end_field(int64_t[:] dtindex, object field, if (dts.month == end_month) and (ldom == doy): out[i] = 1 - return out.view(bool) - raise ValueError("Field %s not supported" % field) + else: + raise ValueError("Field {field} not supported".format(field=field)) + + return out.view(bool) @cython.wraparound(False) @@ -542,7 +545,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): @cython.wraparound(False) @cython.boundscheck(False) -def get_timedelta_field(ndarray[int64_t] tdindex, object field): +def get_timedelta_field(int64_t[:] tdindex, object field): """ Given a int64-based timedelta index, extract the days, hrs, sec., field and return an array of these values. 
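The tz_localize_to_utc rework above replaces the old errors='raise'/'coerce' keyword with a nonexistent={'raise', 'NaT', 'shift'} argument. The short sketch below is not part of the patch; it only illustrates the intended user-visible behaviour, assuming a pandas build that already contains this change and that Timestamp.tz_localize forwards the new keyword as described in the docstrings added in nattype.pyx:

import pandas as pd

# 2018-03-11 02:30 does not exist in US/Eastern: clocks jump from 02:00 to 03:00.
ts = pd.Timestamp('2018-03-11 02:30:00')

# 'shift' rolls the nonexistent wall time forward to the closest existing time.
print(ts.tz_localize('US/Eastern', nonexistent='shift'))  # 2018-03-11 03:00:00-04:00
# 'NaT' returns NaT instead of raising.
print(ts.tz_localize('US/Eastern', nonexistent='NaT'))    # NaT
# The default, 'raise', keeps the previous behaviour: pytz.NonExistentTimeError.
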
diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index c555fce9dd007..fff4d04399481 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -154,8 +154,7 @@ cpdef get_freq_code(freqstr): freqstr = (freqstr.rule_code, freqstr.n) if isinstance(freqstr, tuple): - if (is_integer_object(freqstr[0]) and - is_integer_object(freqstr[1])): + if is_integer_object(freqstr[0]) and is_integer_object(freqstr[1]): # e.g., freqstr = (2000, 1) return freqstr else: @@ -171,7 +170,7 @@ cpdef get_freq_code(freqstr): return code, stride if is_integer_object(freqstr): - return (freqstr, 1) + return freqstr, 1 base, stride = _base_and_stride(freqstr) code = _period_str_to_code(base) @@ -183,6 +182,11 @@ cpdef _base_and_stride(freqstr): """ Return base freq and stride info from string representation + Returns + ------- + base : str + stride : int + Examples -------- _freq_and_stride('5Min') -> 'Min', 5 @@ -201,7 +205,7 @@ cpdef _base_and_stride(freqstr): base = groups.group(2) - return (base, stride) + return base, stride cpdef _period_str_to_code(freqstr): diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index 382ac9d323918..f649518e969be 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -1,9 +1,20 @@ # -*- coding: utf-8 -*- +from cpython.datetime cimport datetime + from numpy cimport int64_t cdef int64_t NPY_NAT cdef bint _nat_scalar_rules[6] + +cdef class _NaT(datetime): + cdef readonly: + int64_t value + object freq + +cdef _NaT c_NaT + + cdef bint checknull_with_nat(object val) cdef bint is_null_datetimelike(object val) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index ae4f9c821b5d1..42ec235992089 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -47,7 +47,7 @@ def _make_nan_func(func_name, doc): def _make_nat_func(func_name, doc): def f(*args, **kwargs): - return NaT + return c_NaT f.__name__ = func_name f.__doc__ = doc return f @@ -67,10 +67,10 @@ def _make_error_func(func_name, cls): cdef _nat_divide_op(self, other): - if PyDelta_Check(other) or is_timedelta64_object(other) or other is NaT: + if PyDelta_Check(other) or is_timedelta64_object(other) or other is c_NaT: return np.nan if is_integer_object(other) or is_float_object(other): - return NaT + return c_NaT return NotImplemented @@ -82,22 +82,23 @@ cdef _nat_rdivide_op(self, other): def __nat_unpickle(*args): # return constant defined in the module - return NaT + return c_NaT # ---------------------------------------------------------------------- cdef class _NaT(datetime): - cdef readonly: - int64_t value - object freq + # cdef readonly: + # int64_t value + # object freq def __hash__(_NaT self): # py3k needs this defined here return hash(self.value) def __richcmp__(_NaT self, object other, int op): - cdef int ndim = getattr(other, 'ndim', -1) + cdef: + int ndim = getattr(other, 'ndim', -1) if ndim == -1: return _nat_scalar_rules[op] @@ -115,18 +116,18 @@ cdef class _NaT(datetime): def __add__(self, other): if PyDateTime_Check(other): - return NaT + return c_NaT elif hasattr(other, 'delta'): # Timedelta, offsets.Tick, offsets.Week - return NaT + return c_NaT elif getattr(other, '_typ', None) in ['dateoffset', 'series', 'period', 'datetimeindex', 'timedeltaindex']: # Duplicate logic in _Timestamp.__add__ to avoid needing # to subclass; allows us to @final(_Timestamp.__add__) return NotImplemented - return NaT + return c_NaT def __sub__(self, 
other): # Duplicate some logic from _Timestamp.__sub__ to avoid needing @@ -181,20 +182,7 @@ cdef class _NaT(datetime): def to_datetime64(self): """ Returns a numpy.datetime64 object with 'ns' precision """ - return np.datetime64('NaT') - - -class NaTType(_NaT): - """(N)ot-(A)-(T)ime, the time equivalent of NaN""" - - def __new__(cls): - cdef _NaT base - - base = _NaT.__new__(cls, 1, 1, 1) - base.value = NPY_NAT - base.freq = None - - return base + return np.datetime64('NaT', 'ns') def __repr__(self): return 'NaT' @@ -215,20 +203,11 @@ class NaTType(_NaT): def __long__(self): return NPY_NAT - def __reduce_ex__(self, protocol): - # python 3.6 compat - # http://bugs.python.org/issue28730 - # now __reduce_ex__ is defined and higher priority than __reduce__ - return self.__reduce__() - - def __reduce__(self): - return (__nat_unpickle, (None, )) - def total_seconds(self): """ Total duration of timedelta in seconds (to ns precision) """ - # GH 10939 + # GH#10939 return np.nan @property @@ -259,6 +238,28 @@ class NaTType(_NaT): def is_year_end(self): return False + +class NaTType(_NaT): + """(N)ot-(A)-(T)ime, the time equivalent of NaN""" + + def __new__(cls): + cdef _NaT base + + base = _NaT.__new__(cls, 1, 1, 1) + base.value = NPY_NAT + base.freq = None + + return base + + def __reduce_ex__(self, protocol): + # python 3.6 compat + # http://bugs.python.org/issue28730 + # now __reduce_ex__ is defined and higher priority than __reduce__ + return self.__reduce__() + + def __reduce__(self): + return (__nat_unpickle, (None, )) + def __rdiv__(self, other): return _nat_rdivide_op(self, other) @@ -270,7 +271,7 @@ class NaTType(_NaT): def __rmul__(self, other): if is_integer_object(other) or is_float_object(other): - return NaT + return c_NaT return NotImplemented # ---------------------------------------------------------------------- @@ -484,6 +485,17 @@ class NaTType(_NaT): - 'raise' will raise an AmbiguousTimeError for an ambiguous time .. versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 Raises ------ @@ -503,6 +515,17 @@ class NaTType(_NaT): - 'raise' will raise an AmbiguousTimeError for an ambiguous time .. versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 Raises ------ @@ -522,6 +545,17 @@ class NaTType(_NaT): - 'raise' will raise an AmbiguousTimeError for an ambiguous time .. versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. 
versionadded:: 0.24.0 Raises ------ @@ -559,19 +593,38 @@ class NaTType(_NaT): None will remove timezone holding local time. ambiguous : bool, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. + - bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates) - 'NaT' will return NaT for an ambiguous time - 'raise' will raise an AmbiguousTimeError for an ambiguous time - errors : 'raise', 'coerce', default 'raise' + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 + + errors : 'raise', 'coerce', default None - 'raise' will raise a NonExistentTimeError if a timestamp is not valid in the specified timezone (e.g. due to a transition from - or to DST time) + or to DST time). Use ``nonexistent='raise'`` instead. - 'coerce' will return NaT if the timestamp can not be converted - into the specified timezone + into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. versionadded:: 0.19.0 + .. deprecated:: 0.24.0 Returns ------- @@ -595,7 +648,7 @@ class NaTType(_NaT): minute : int, optional second : int, optional microsecond : int, optional - nanosecond: int, optional + nanosecond : int, optional tzinfo : tz-convertible, optional fold : int, optional, default is 0 added in 3.6, NotImplemented @@ -606,14 +659,15 @@ class NaTType(_NaT): """) -NaT = NaTType() +c_NaT = NaTType() # C-visible +NaT = c_NaT # Python-visible # ---------------------------------------------------------------------- cdef inline bint checknull_with_nat(object val): """ utility to check if a value is a nat or not """ - return val is None or util.is_nan(val) or val is NaT + return val is None or util.is_nan(val) or val is c_NaT cdef inline bint is_null_datetimelike(object val): @@ -630,7 +684,7 @@ cdef inline bint is_null_datetimelike(object val): """ if val is None or util.is_nan(val): return True - elif val is NaT: + elif val is c_NaT: return True elif util.is_timedelta64_object(val): return val.view('int64') == NPY_NAT diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index e0ecfc24804a9..dbbe9da381f0a 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -37,6 +37,7 @@ cdef extern from "src/datetime/np_datetime_strings.h": npy_datetimestruct *out, int *out_local, int *out_tzoffset) + # ---------------------------------------------------------------------- # numpy object inspection @@ -136,6 +137,7 @@ cdef inline void dt64_to_dtstruct(int64_t dt64, pandas_datetime_to_datetimestruct(dt64, NPY_FR_ns, out) return + cdef inline void td64_to_tdstruct(int64_t td64, pandas_timedeltastruct* out) nogil: """Convenience function to call pandas_timedelta_to_timedeltastruct diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4d611f89bca9c..f3ac102bf177e 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ 
b/pandas/_libs/tslibs/offsets.pyx @@ -5,6 +5,7 @@ from cython import Py_ssize_t import time from cpython.datetime cimport (PyDateTime_IMPORT, + PyDateTime_Check, datetime, timedelta, time as dt_time) PyDateTime_IMPORT @@ -25,6 +26,7 @@ from conversion cimport tz_convert_single, pydt_to_i8, localize_pydatetime from nattype cimport NPY_NAT from np_datetime cimport (npy_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct) +from timezones import UTC # --------------------------------------------------------------------- # Constants @@ -83,6 +85,8 @@ cdef to_offset(object obj): Wrap pandas.tseries.frequencies.to_offset to keep centralize runtime imports """ + if isinstance(obj, _BaseOffset): + return obj from pandas.tseries.frequencies import to_offset return to_offset(obj) @@ -208,7 +212,7 @@ def _to_dt64(dt, dtype='datetime64'): # Thus astype is needed to cast datetime to datetime64[D] if getattr(dt, 'tzinfo', None) is not None: i8 = pydt_to_i8(dt) - dt = tz_convert_single(i8, 'UTC', dt.tzinfo) + dt = tz_convert_single(i8, UTC, dt.tzinfo) dt = np.int64(dt).astype('datetime64[ns]') else: dt = np.datetime64(dt) @@ -282,11 +286,6 @@ class ApplyTypeError(TypeError): pass -# TODO: unused. remove? -class CacheableOffset(object): - _cacheable = True - - # --------------------------------------------------------------------- # Base Classes @@ -296,8 +295,6 @@ class _BaseOffset(object): and will (after pickle errors are resolved) go into a cdef class. """ _typ = "dateoffset" - _normalize_cache = True - _cacheable = False _day_opt = None _attributes = frozenset(['n', 'normalize']) @@ -312,8 +309,13 @@ class _BaseOffset(object): def __eq__(self, other): if is_string_object(other): - other = to_offset(other) - + try: + # GH#23524 if to_offset fails, we are dealing with an + # incomparable type so == is False and != is True + other = to_offset(other) + except ValueError: + # e.g. "infer" + return False try: return self._params == other._params except AttributeError: @@ -351,8 +353,9 @@ class _BaseOffset(object): return {name: kwds[name] for name in kwds if kwds[name] is not None} def __add__(self, other): - if getattr(other, "_typ", None) in ["datetimeindex", - "series", "period"]: + if getattr(other, "_typ", None) in ["datetimeindex", "periodindex", + "datetimearray", "periodarray", + "series", "period", "dataframe"]: # defer to the other class's implementation return other + self try: @@ -361,7 +364,7 @@ class _BaseOffset(object): return NotImplemented def __sub__(self, other): - if isinstance(other, datetime): + if PyDateTime_Check(other): raise TypeError('Cannot subtract datetime from offset.') elif type(other) == type(self): return type(self)(self.n - other.n, normalize=self.normalize, @@ -386,10 +389,6 @@ class _BaseOffset(object): # that allows us to use methods that can go in a `cdef class` return self * 1 - # TODO: this is never true. fix it or get rid of it - def _should_cache(self): - return self.isAnchored() and self._cacheable - def __repr__(self): className = getattr(self, '_outputName', type(self).__name__) @@ -507,7 +506,7 @@ class _Tick(object): # ---------------------------------------------------------------------- # RelativeDelta Arithmetic -cpdef datetime shift_day(datetime other, int days): +def shift_day(other: datetime, days: int) -> datetime: """ Increment the datetime `other` by the given number of days, retaining the time-portion of the datetime. 
For tz-naive datetimes this is @@ -542,7 +541,8 @@ cdef inline int month_add_months(npy_datetimestruct dts, int months) nogil: New month number after shifting npy_datetimestruct number of months. """ - cdef int new_month = (dts.month + months) % 12 + cdef: + int new_month = (dts.month + months) % 12 return 12 if new_month == 0 else new_month @@ -826,7 +826,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): return np.asarray(out) -cpdef datetime shift_month(datetime stamp, int months, object day_opt=None): +def shift_month(stamp: datetime, months: int, + day_opt: object=None) -> datetime: """ Given a datetime (or Timestamp) `stamp`, an integer `months` and an option `day_opt`, return a new datetimelike that many months later, @@ -956,8 +957,8 @@ cpdef int roll_convention(int other, int n, int compare) nogil: return n -cpdef int roll_qtrday(datetime other, int n, int month, object day_opt, - int modby=3) except? -1: +def roll_qtrday(other: datetime, n: int, month: int, + day_opt: object, modby: int=3) -> int: """ Possibly increment or decrement the number of periods to shift based on rollforward/rollbackward conventions. @@ -999,8 +1000,7 @@ cpdef int roll_qtrday(datetime other, int n, int month, object day_opt, return n -cpdef int roll_yearday(datetime other, int n, int month, - object day_opt) except? -1: +def roll_yearday(other: datetime, n: int, month: int, day_opt: object) -> int: """ Possibly increment or decrement the number of periods to shift based on rollforward/rollbackward conventions. diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 3887957aeefd4..9a01bf378e549 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -118,12 +118,12 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): if getattr(freq, "_typ", None) == "dateoffset": freq = freq.rule_code - if dayfirst is None: + if dayfirst is None or yearfirst is None: from pandas.core.config import get_option - dayfirst = get_option("display.date_dayfirst") - if yearfirst is None: - from pandas.core.config import get_option - yearfirst = get_option("display.date_yearfirst") + if dayfirst is None: + dayfirst = get_option("display.date_dayfirst") + if yearfirst is None: + yearfirst = get_option("display.date_yearfirst") res = parse_datetime_string_with_reso(arg, freq=freq, dayfirst=dayfirst, @@ -361,7 +361,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, return ret, reso -cpdef object _get_rule_month(object source, object default='DEC'): +cdef object _get_rule_month(object source, object default='DEC'): """ Return starting month of given freq, default is December. 
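Two behavioural notes on the offsets.pyx hunks above, both taken directly from the patch: `to_offset` now returns immediately when it is already handed a `_BaseOffset`, and `_BaseOffset.__eq__` catches the `ValueError` from `to_offset`, so comparing an offset against an unparseable string (the diff's own example is "infer", GH#23524) yields False/True for ==/!= rather than raising. A small sketch of the resulting comparisons, assuming the patch as shown:

    from pandas.tseries.offsets import Day

    d = Day()

    print(d == "D")      # True: the string alias still round-trips through to_offset
    print(d == "infer")  # False: to_offset raises ValueError, treated as incomparable
    print(d != "infer")  # True: per the GH#23524 comment, != is simply the negation

Relatedly, `_base_and_stride` in the earlier frequencies.pyx hunk now documents its return value explicitly (``'5Min' -> ('Min', 5)``), and `parse_time_string` in parsing.pyx imports `pandas.core.config` only once when either `dayfirst` or `yearfirst` needs its display-option default.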
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 43dc415bfd464..dfbf24cf177f6 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2,7 +2,6 @@ from datetime import datetime, date from cpython cimport ( - PyUnicode_Check, PyObject_RichCompareBool, Py_EQ, Py_NE) @@ -14,9 +13,9 @@ from libc.stdlib cimport free, malloc from libc.time cimport strftime, tm from libc.string cimport strlen, memset -cimport cython +import cython -from cpython.datetime cimport (PyDateTime_Check, PyDelta_Check, +from cpython.datetime cimport (PyDateTime_Check, PyDelta_Check, PyDate_Check, PyDateTime_IMPORT) # import datetime C API PyDateTime_IMPORT @@ -33,7 +32,7 @@ cdef extern from "src/datetime/np_datetime.h": cimport util from util cimport is_period_object, is_string_object -from timestamps import Timestamp +from timestamps import Timestamp, maybe_integer_op_deprecated from timezones cimport is_utc, is_tzlocal, get_dst_info from timedeltas import Timedelta from timedeltas cimport delta_to_nanoseconds @@ -47,8 +46,9 @@ from frequencies cimport (get_freq_code, get_base_alias, get_rule_month) from parsing import parse_time_string from resolution import Resolution -from nattype import nat_strings, NaT, iNaT -from nattype cimport _nat_scalar_rules, NPY_NAT, is_null_datetimelike +from nattype import nat_strings +from nattype cimport ( + _nat_scalar_rules, NPY_NAT, is_null_datetimelike, c_NaT as NaT) from offsets cimport to_offset from offsets import _Tick @@ -307,6 +307,7 @@ cdef inline int64_t transform_via_day(int64_t ordinal, result = second_func(result, af_info) return result + # -------------------------------------------------------------------- # Conversion _to_ Daily Freq @@ -551,7 +552,7 @@ cdef int64_t asfreq_AtoA(int64_t ordinal, asfreq_info *af_info): cdef int64_t asfreq_AtoQ(int64_t ordinal, asfreq_info *af_info): return transform_via_day(ordinal, af_info, asfreq_AtoDT, - asfreq_DTtoQ); + asfreq_DTtoQ) cdef int64_t asfreq_AtoM(int64_t ordinal, asfreq_info *af_info): @@ -976,7 +977,6 @@ cdef inline int month_to_quarter(int month): # ---------------------------------------------------------------------- # Period logic - @cython.wraparound(False) @cython.boundscheck(False) def dt64arr_to_periodarr(int64_t[:] dtarr, int freq, tz=None): @@ -1041,8 +1041,8 @@ cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): freq_conv_func func asfreq_info af_info - if ordinal == iNaT: - return iNaT + if ordinal == NPY_NAT: + return NPY_NAT func = get_asfreq_func(freq1, freq2) get_asfreq_info(freq1, freq2, end, &af_info) @@ -1106,6 +1106,8 @@ cdef inline int calc_week_end(int freq, int group) nogil: return freq - group +@cython.wraparound(False) +@cython.boundscheck(False) def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): """ Convert int64-array of period ordinals from one frequency to another, and @@ -1124,11 +1126,11 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): func = get_asfreq_func(freq1, freq2) get_asfreq_info(freq1, freq2, end, &af_info) - mask = arr == iNaT + mask = arr == NPY_NAT if mask.any(): # NaT process for i in range(n): val = arr[i] - if val != iNaT: + if val != NPY_NAT: val = func(val, &af_info) if val == INT32_MIN: raise ValueError("Unable to convert to desired frequency.") @@ -1192,7 +1194,7 @@ def period_format(int64_t value, int freq, object fmt=None): cdef: int freq_group - if value == iNaT: + if value == NPY_NAT: return repr(NaT) if fmt is None: @@ 
-1248,7 +1250,7 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): list found_pat = [False] * len(extra_fmts) int year, quarter - if PyUnicode_Check(fmt): + if isinstance(fmt, unicode): fmt = fmt.encode('utf-8') get_date_info(value, freq, &dts) @@ -1259,7 +1261,7 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): fmt = fmt.replace(pat, repl) found_pat[i] = True - formatted = c_strftime(&dts, fmt) + formatted = c_strftime(&dts, fmt) result = util.char_to_string(formatted) free(formatted) @@ -1381,6 +1383,8 @@ cdef int pdays_in_month(int64_t ordinal, int freq): return ccalendar.get_days_in_month(dts.year, dts.month) +@cython.wraparound(False) +@cython.boundscheck(False) def get_period_field_arr(int code, int64_t[:] arr, int freq): cdef: Py_ssize_t i, sz @@ -1395,7 +1399,7 @@ def get_period_field_arr(int code, int64_t[:] arr, int freq): out = np.empty(sz, dtype=np.int64) for i in range(sz): - if arr[i] == iNaT: + if arr[i] == NPY_NAT: out[i] = -1 continue out[i] = func(arr[i], freq) @@ -1431,6 +1435,8 @@ cdef accessor _get_accessor_func(int code): return NULL +@cython.wraparound(False) +@cython.boundscheck(False) def extract_ordinals(object[:] values, freq): cdef: Py_ssize_t i, n = len(values) @@ -1443,7 +1449,7 @@ def extract_ordinals(object[:] values, freq): p = values[i] if is_null_datetimelike(p): - ordinals[i] = iNaT + ordinals[i] = NPY_NAT else: try: ordinals[i] = p.ordinal @@ -1456,7 +1462,7 @@ def extract_ordinals(object[:] values, freq): p = Period(p, freq=freq) if p is NaT: # input may contain NaT-like string - ordinals[i] = iNaT + ordinals[i] = NPY_NAT else: ordinals[i] = p.ordinal @@ -1562,7 +1568,6 @@ cdef class _Period(object): @classmethod def _maybe_convert_freq(cls, object freq): - if isinstance(freq, (int, tuple)): code, stride = get_freq_code(freq) freq = get_freq_str(code, stride) @@ -1581,7 +1586,7 @@ cdef class _Period(object): """ Fast creation from an ordinal and freq that are already validated! """ - if ordinal == iNaT: + if ordinal == NPY_NAT: return NaT else: freq = cls._maybe_convert_freq(freq) @@ -1645,6 +1650,8 @@ cdef class _Period(object): elif other is NaT: return NaT elif util.is_integer_object(other): + maybe_integer_op_deprecated(self) + ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) elif (PyDateTime_Check(other) or @@ -1671,6 +1678,8 @@ cdef class _Period(object): neg_other = -other return self + neg_other elif util.is_integer_object(other): + maybe_integer_op_deprecated(self) + ordinal = self.ordinal - other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) elif is_period_object(other): @@ -1731,7 +1740,7 @@ cdef class _Period(object): ------- Timestamp - See also + See Also -------- Period.end_time : Return the end Timestamp. Period.dayofyear : Return the day of year. @@ -1756,7 +1765,7 @@ cdef class _Period(object): def end_time(self): # freq.n can't be negative or 0 # ordinal = (self + self.freq.n).start_time.value - 1 - ordinal = (self + 1).start_time.value - 1 + ordinal = (self + self.freq).start_time.value - 1 return Timestamp(ordinal) def to_timestamp(self, freq=None, how='start', tz=None): @@ -1769,7 +1778,7 @@ cdef class _Period(object): freq : string or DateOffset Target frequency. Default is 'D' if self.freq is week or longer and 'S' otherwise - how: str, default 'S' (start) + how : str, default 'S' (start) 'S', 'E'. 
Can be aliased as case insensitive 'Start', 'Finish', 'Begin', 'End' @@ -1783,7 +1792,8 @@ cdef class _Period(object): end = how == 'E' if end: - return (self + 1).to_timestamp(how='start') - Timedelta(1, 'ns') + endpoint = (self + self.freq).to_timestamp(how='start') + return endpoint - Timedelta(1, 'ns') if freq is None: base, mult = get_freq_code(self.freq) @@ -2415,7 +2425,7 @@ class Period(_Period): if (year is None and month is None and quarter is None and day is None and hour is None and minute is None and second is None): - ordinal = iNaT + ordinal = NPY_NAT else: if freq is None: raise ValueError("If value is None, freq cannot be None") @@ -2441,7 +2451,7 @@ class Period(_Period): ordinal = converted.ordinal elif is_null_datetimelike(value) or value in nat_strings: - ordinal = iNaT + ordinal = NPY_NAT elif is_string_object(value) or util.is_integer_object(value): if util.is_integer_object(value): @@ -2449,7 +2459,7 @@ class Period(_Period): value = value.upper() dt, _, reso = parse_time_string(value, freq) if dt is NaT: - ordinal = iNaT + ordinal = NPY_NAT if freq is None: try: @@ -2459,7 +2469,7 @@ class Period(_Period): "Invalid frequency or could not infer: {reso}" .format(reso=reso)) - elif isinstance(value, datetime): + elif PyDateTime_Check(value): dt = value if freq is None: raise ValueError('Must supply freq for datetime value') @@ -2467,7 +2477,7 @@ class Period(_Period): dt = Timestamp(value) if freq is None: raise ValueError('Must supply freq for datetime value') - elif isinstance(value, date): + elif PyDate_Check(value): dt = datetime(year=value.year, month=value.month, day=value.day) if freq is None: raise ValueError('Must supply freq for datetime value') diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index eda4418902513..c02a840281266 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -3,8 +3,6 @@ from numpy cimport int64_t # Exposed for tslib, not intended for outside use. -cdef parse_timedelta_string(object ts) -cpdef int64_t cast_from_unit(object ts, object unit) except? -1 +cdef int64_t cast_from_unit(object ts, object unit) except? -1 cpdef int64_t delta_to_nanoseconds(delta) except? 
-1 cpdef convert_to_timedelta64(object ts, object unit) -cpdef array_to_timedelta64(object[:] values, unit=*, errors=*) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 9c8be1901d1dc..b0bead2f66ce4 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -6,6 +6,7 @@ import warnings import sys cdef bint PY3 = (sys.version_info[0] >= 3) +import cython from cython import Py_ssize_t from cpython cimport Py_NE, Py_EQ, PyObject_RichCompare @@ -27,27 +28,33 @@ from util cimport (is_timedelta64_object, is_datetime64_object, is_integer_object, is_float_object, is_string_object) +from ccalendar import DAY_SECONDS + from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct) -from nattype import nat_strings, NaT -from nattype cimport checknull_with_nat, NPY_NAT +from nattype import nat_strings +from nattype cimport checknull_with_nat, NPY_NAT, c_NaT as NaT from offsets cimport to_offset # ---------------------------------------------------------------------- # Constants -cdef int64_t DAY_NS = 86400000000000LL - # components named tuple Components = collections.namedtuple('Components', [ 'days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds', 'nanoseconds']) -cdef dict timedelta_abbrevs = { 'D': 'd', - 'd': 'd', - 'days': 'd', - 'day': 'd', + +cdef dict timedelta_abbrevs = { 'Y': 'Y', + 'y': 'Y', + 'M': 'M', + 'W': 'W', + 'w': 'W', + 'D': 'D', + 'd': 'D', + 'days': 'D', + 'day': 'D', 'hours': 'h', 'hour': 'h', 'hr': 'h', @@ -56,6 +63,7 @@ cdef dict timedelta_abbrevs = { 'D': 'd', 'minute': 'm', 'min': 'm', 'minutes': 'm', + 't': 'm', 's': 's', 'seconds': 's', 'sec': 's', @@ -65,16 +73,19 @@ cdef dict timedelta_abbrevs = { 'D': 'd', 'millisecond': 'ms', 'milli': 'ms', 'millis': 'ms', + 'l': 'ms', 'us': 'us', 'microseconds': 'us', 'microsecond': 'us', 'micro': 'us', 'micros': 'us', + 'u': 'us', 'ns': 'ns', 'nanoseconds': 'ns', 'nano': 'ns', 'nanos': 'ns', - 'nanosecond': 'ns'} + 'nanosecond': 'ns', + 'n': 'ns'} _no_input = object() @@ -82,6 +93,8 @@ _no_input = object() # ---------------------------------------------------------------------- # API +@cython.boundscheck(False) +@cython.wraparound(False) def ints_to_pytimedelta(int64_t[:] arr, box=False): """ convert an i8 repr to an ndarray of timedelta or Timedelta (if box == @@ -119,8 +132,6 @@ def ints_to_pytimedelta(int64_t[:] arr, box=False): # ---------------------------------------------------------------------- cpdef int64_t delta_to_nanoseconds(delta) except? -1: - if util.is_array(delta): - return delta.astype('m8[ns]').astype('int64') if hasattr(delta, 'nanos'): return delta.nanos if hasattr(delta, 'delta'): @@ -129,15 +140,18 @@ cpdef int64_t delta_to_nanoseconds(delta) except? -1: return delta.astype("timedelta64[ns]").item() if is_integer_object(delta): return delta + if PyDelta_Check(delta): + return (delta.days * 24 * 60 * 60 * 1000000 + + delta.seconds * 1000000 + + delta.microseconds) * 1000 - return (delta.days * 24 * 60 * 60 * 1000000 + - delta.seconds * 1000000 + - delta.microseconds) * 1000 + raise TypeError(type(delta)) cpdef convert_to_timedelta64(object ts, object unit): """ - Convert an incoming object to a timedelta64 if possible + Convert an incoming object to a timedelta64 if possible. 
+ Before calling, unit must be standardized to avoid repeated unit conversion Handle these types of objects: - timedelta/Timedelta @@ -198,7 +212,9 @@ cpdef convert_to_timedelta64(object ts, object unit): return ts.astype('timedelta64[ns]') -cpdef array_to_timedelta64(object[:] values, unit='ns', errors='raise'): +@cython.boundscheck(False) +@cython.wraparound(False) +def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): """ Convert an ndarray to an array of timedeltas. If errors == 'coerce', coerce non-convertible objects to NaT. Otherwise, raise. @@ -223,6 +239,7 @@ cpdef array_to_timedelta64(object[:] values, unit='ns', errors='raise'): for i in range(n): result[i] = parse_timedelta_string(values[i]) except: + unit = parse_timedelta_unit(unit) for i in range(n): try: result[i] = convert_to_timedelta64(values[i], unit) @@ -235,15 +252,24 @@ cpdef array_to_timedelta64(object[:] values, unit='ns', errors='raise'): return iresult.base # .base to access underlying np.ndarray -cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: +cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: """ return a casting of the unit represented to nanoseconds round the fractional part of a float to our precision, p """ cdef: int64_t m int p - if unit == 'D' or unit == 'd': - m = 1000000000L * 86400 + if unit == 'Y': + m = 1000000000L * 31556952 + p = 9 + elif unit == 'M': + m = 1000000000L * 2629746 + p = 9 + elif unit == 'W': + m = 1000000000L * DAY_SECONDS * 7 + p = 9 + elif unit == 'D' or unit == 'd': + m = 1000000000L * DAY_SECONDS p = 9 elif unit == 'h': m = 1000000000L * 3600 @@ -368,7 +394,7 @@ cdef inline parse_timedelta_string(object ts): elif current_unit == 'm': current_unit = 's' m = 1000000000L - r = int(''.join(number)) * m + r = int(''.join(number)) * m result += timedelta_as_neg(r, neg) have_hhmmss = 1 else: @@ -388,7 +414,7 @@ cdef inline parse_timedelta_string(object ts): if current_unit != 'm': raise ValueError("expected hh:mm:ss format before .") m = 1000000000L - r = int(''.join(number)) * m + r = int(''.join(number)) * m result += timedelta_as_neg(r, neg) have_value = 1 unit, number, frac = [], [], [] @@ -422,7 +448,7 @@ cdef inline parse_timedelta_string(object ts): else: m = 10**(9 -len(frac)) - r = int(''.join(frac)) * m + r = int(''.join(frac)) * m result += timedelta_as_neg(r, neg) # we have a regular format @@ -431,7 +457,7 @@ cdef inline parse_timedelta_string(object ts): if current_unit != 'm': raise ValueError("expected hh:mm:ss format") m = 1000000000L - r = int(''.join(number)) * m + r = int(''.join(number)) * m result += timedelta_as_neg(r, neg) # we have a last abbreviation @@ -480,7 +506,11 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): try: unit = ''.join(unit) - unit = timedelta_abbrevs[unit.lower()] + if unit == 'M': + # To parse ISO 8601 string, 'M' should be treated as minute, + # not month + unit = 'm' + unit = parse_timedelta_unit(unit) except KeyError: raise ValueError("invalid abbreviation: {unit}".format(unit=unit)) @@ -488,6 +518,22 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): return cast_from_unit(float(n), unit) +cpdef inline object parse_timedelta_unit(object unit): + """ + Parameters + ---------- + unit : an unit string + """ + if unit is None: + return 'ns' + elif unit == 'M': + return unit + try: + return timedelta_abbrevs[unit.lower()] + except (KeyError, AttributeError): + raise ValueError("invalid unit abbreviation: {unit}" + 
.format(unit=unit)) + # ---------------------------------------------------------------------- # Timedelta ops utilities @@ -724,27 +770,12 @@ cdef class _Timedelta(timedelta): if is_timedelta64_object(other): other = Timedelta(other) else: - if op == Py_EQ: - return False - elif op == Py_NE: - return True - - # only allow ==, != ops - raise TypeError('Cannot compare type {cls} with ' - 'type {other}' - .format(cls=type(self).__name__, - other=type(other).__name__)) + return NotImplemented if util.is_array(other): return PyObject_RichCompare(np.array([self]), other, op) return PyObject_RichCompare(other, self, reverse_ops[op]) else: - if op == Py_EQ: - return False - elif op == Py_NE: - return True - raise TypeError('Cannot compare type {cls} with type {other}' - .format(cls=type(self).__name__, - other=type(other).__name__)) + return NotImplemented return cmp_scalar(self.value, ots.value, op) @@ -1080,8 +1111,14 @@ class Timedelta(_Timedelta): Parameters ---------- value : Timedelta, timedelta, np.timedelta64, string, or integer - unit : string, {'ns', 'us', 'ms', 's', 'm', 'h', 'D'}, optional + unit : str, optional Denote the unit of the input, if input is an integer. Default 'ns'. + Possible values: + {'Y', 'M', 'W', 'D', 'days', 'day', 'hours', hour', 'hr', 'h', + 'm', 'minute', 'min', 'minutes', 'T', 'S', 'seconds', 'sec', 'second', + 'ms', 'milliseconds', 'millisecond', 'milli', 'millis', 'L', + 'us', 'microseconds', 'microsecond', 'micro', 'micros', 'U', + 'ns', 'nanoseconds', 'nano', 'nanos', 'nanosecond', 'N'} days, seconds, microseconds, milliseconds, minutes, hours, weeks : numeric, optional Values for construction in compat with datetime.timedelta. @@ -1131,6 +1168,7 @@ class Timedelta(_Timedelta): value = np.timedelta64(delta_to_nanoseconds(value.delta), 'ns') elif is_integer_object(value) or is_float_object(value): # unit=None is de-facto 'ns' + unit = parse_timedelta_unit(unit) value = convert_to_timedelta64(value, unit) elif checknull_with_nat(value): return NaT @@ -1228,6 +1266,12 @@ class Timedelta(_Timedelta): return other.delta * self return NotImplemented + elif util.is_nan(other): + # i.e. np.nan, but also catch np.float64("NaN") which would + # otherwise get caught by the hasattr(other, "dtype") branch + # incorrectly return a np.timedelta64 object. + return NaT + elif hasattr(other, 'dtype'): # ndarray-like return other * self.to_timedelta64() @@ -1255,6 +1299,12 @@ class Timedelta(_Timedelta): # convert to Timedelta below pass + elif util.is_nan(other): + # i.e. np.nan, but also catch np.float64("NaN") which would + # otherwise get caught by the hasattr(other, "dtype") branch + # incorrectly return a np.timedelta64 object. 
+ return NaT + elif hasattr(other, 'dtype'): return self.to_timedelta64() / other diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 0c2753dbc6f28..b4862a5f3b02f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import enum import warnings from cpython cimport (PyObject_RichCompareBool, PyObject_RichCompare, @@ -16,16 +17,16 @@ from cpython.datetime cimport (datetime, PyDateTime_IMPORT from util cimport (is_datetime64_object, is_timedelta64_object, - is_integer_object, is_string_object, is_array) + is_integer_object, is_string_object, is_array, + is_offset_object) cimport ccalendar +from ccalendar import DAY_SECONDS from conversion import tz_localize_to_utc, normalize_i8_timestamps from conversion cimport (tz_convert_single, _TSObject, convert_to_tsobject, convert_datetime_to_tsobject) -import enum from fields import get_start_end_field, get_date_name_field -from nattype import NaT -from nattype cimport NPY_NAT +from nattype cimport NPY_NAT, c_NaT as NaT from np_datetime import OutOfBoundsDatetime from np_datetime cimport (reverse_ops, cmp_scalar, check_dts_bounds, npy_datetimestruct, dt64_to_dtstruct) @@ -34,14 +35,26 @@ from timedeltas import Timedelta from timedeltas cimport delta_to_nanoseconds from timezones cimport ( get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz, tz_compare) +from timezones import UTC # ---------------------------------------------------------------------- # Constants _zero_time = datetime_time(0, 0) _no_input = object() + # ---------------------------------------------------------------------- +def maybe_integer_op_deprecated(obj): + # GH#22535 add/sub of integers and int-arrays is deprecated + if obj.freq is not None: + warnings.warn("Addition/subtraction of integers and integer-arrays " + "to {cls} is deprecated, will be removed in a future " + "version. Instead of adding/subtracting `n`, use " + "`n * self.freq`" + .format(cls=type(obj).__name__), + FutureWarning) + cdef inline object create_timestamp_from_ts(int64_t value, npy_datetimestruct dts, @@ -107,6 +120,7 @@ cdef inline _npdivmod(x1, x2): try: from numpy import divmod as npdivmod except ImportError: + # numpy < 1.13 npdivmod = _npdivmod @@ -165,7 +179,8 @@ def round_nsint64(values, mode, freq): # if/elif above should catch all rounding modes defined in enum 'RoundTo': # if flow of control arrives here, it is a bug - assert False, "round_nsint64 called with an unrecognized rounding mode" + raise AssertionError("round_nsint64 called with an unrecognized " + "rounding mode") # This is PITA. 
Because we inherit from datetime, which has very specific @@ -267,7 +282,8 @@ cdef class _Timestamp(datetime): cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: - cdef datetime dtval = self.to_pydatetime() + cdef: + datetime dtval = self.to_pydatetime() self._assert_tzawareness_compat(other) @@ -287,8 +303,7 @@ cdef class _Timestamp(datetime): elif op == Py_GE: return dtval >= other - cdef int _assert_tzawareness_compat(_Timestamp self, - object other) except -1: + cdef _assert_tzawareness_compat(_Timestamp self, datetime other): if self.tzinfo is None: if other.tzinfo is not None: raise TypeError('Cannot compare tz-naive and tz-aware ' @@ -296,7 +311,7 @@ cdef class _Timestamp(datetime): elif other.tzinfo is None: raise TypeError('Cannot compare tz-naive and tz-aware timestamps') - cpdef datetime to_pydatetime(_Timestamp self, warn=True): + cpdef datetime to_pydatetime(_Timestamp self, bint warn=True): """ Convert a Timestamp object to a native Python datetime object. @@ -315,7 +330,8 @@ cdef class _Timestamp(datetime): return np.datetime64(self.value, 'ns') def __add__(self, other): - cdef int64_t other_int, nanos + cdef: + int64_t other_int, nanos if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').view('i8') @@ -323,6 +339,8 @@ cdef class _Timestamp(datetime): tz=self.tzinfo, freq=self.freq) elif is_integer_object(other): + maybe_integer_op_deprecated(self) + if self is NaT: # to be compat with Period return NaT @@ -358,13 +376,15 @@ cdef class _Timestamp(datetime): neg_other = -other return self + neg_other + typ = getattr(other, '_typ', None) + # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex - elif getattr(other, '_typ', None) == 'datetimeindex': + if typ in ('datetimeindex', 'datetimearray'): # timezone comparison is performed in DatetimeIndex._sub_datelike return -other.__sub__(self) # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex - elif getattr(other, '_typ', None) == 'timedeltaindex': + elif typ in ('timedeltaindex', 'timedeltaarray'): return (-other).__add__(self) elif other is NaT: @@ -398,7 +418,7 @@ cdef class _Timestamp(datetime): int64_t val val = self.value if self.tz is not None and not is_utc(self.tz): - val = tz_convert_single(self.value, 'UTC', self.tz) + val = tz_convert_single(self.value, UTC, self.tz) return val cpdef bint _get_start_end_field(self, str field): @@ -615,7 +635,7 @@ class Timestamp(_Timestamp): Return a new Timestamp representing UTC day and time. """ - return cls.now('UTC') + return cls.now(UTC) @classmethod def utcfromtimestamp(cls, ts): @@ -683,6 +703,9 @@ class Timestamp(_Timestamp): elif tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') + # User passed tzinfo instead of tz; avoid silently ignoring + tz, tzinfo = tzinfo, None + if is_string_object(ts_input): # User passed a date string to parse. # Check that the user didn't also pass a date attribute kwarg. @@ -692,36 +715,38 @@ class Timestamp(_Timestamp): elif ts_input is _no_input: # User passed keyword arguments. 
- if tz is None: - # Handle the case where the user passes `tz` and not `tzinfo` - tz = tzinfo - return Timestamp(datetime(year, month, day, hour or 0, - minute or 0, second or 0, - microsecond or 0, tzinfo), - nanosecond=nanosecond, tz=tz) + ts_input = datetime(year, month, day, hour or 0, + minute or 0, second or 0, + microsecond or 0) elif is_integer_object(freq): # User passed positional arguments: # Timestamp(year, month, day[, hour[, minute[, second[, # microsecond[, nanosecond[, tzinfo]]]]]]) - return Timestamp(datetime(ts_input, freq, tz, unit or 0, - year or 0, month or 0, day or 0, - minute), nanosecond=hour, tz=minute) + ts_input = datetime(ts_input, freq, tz, unit or 0, + year or 0, month or 0, day or 0) + nanosecond = hour + tz = minute + freq = None - if tzinfo is not None: - # User passed tzinfo instead of tz; avoid silently ignoring - tz, tzinfo = tzinfo, None + if getattr(ts_input, 'tzinfo', None) is not None and tz is not None: + warnings.warn("Passing a datetime or Timestamp with tzinfo and the" + " tz parameter will raise in the future. Use" + " tz_convert instead.", FutureWarning) ts = convert_to_tsobject(ts_input, tz, unit, 0, 0, nanosecond or 0) if ts.value == NPY_NAT: return NaT - if is_string_object(freq): + if freq is None: + # GH 22311: Try to extract the frequency of a given Timestamp input + freq = getattr(ts_input, 'freq', None) + elif not is_offset_object(freq): freq = to_offset(freq) return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq) - def _round(self, freq, mode, ambiguous='raise'): + def _round(self, freq, mode, ambiguous='raise', nonexistent='raise'): if self.tz is not None: value = self.tz_localize(None).value else: @@ -733,10 +758,12 @@ class Timestamp(_Timestamp): r = round_nsint64(value, mode, freq)[0] result = Timestamp(r, unit='ns') if self.tz is not None: - result = result.tz_localize(self.tz, ambiguous=ambiguous) + result = result.tz_localize( + self.tz, ambiguous=ambiguous, nonexistent=nonexistent + ) return result - def round(self, freq, ambiguous='raise'): + def round(self, freq, ambiguous='raise', nonexistent='raise'): """ Round the Timestamp to the specified resolution @@ -754,14 +781,27 @@ class Timestamp(_Timestamp): - 'raise' will raise an AmbiguousTimeError for an ambiguous time .. versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 Raises ------ ValueError if the freq cannot be converted """ - return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous) + return self._round( + freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent + ) - def floor(self, freq, ambiguous='raise'): + def floor(self, freq, ambiguous='raise', nonexistent='raise'): """ return a new Timestamp floored to this resolution @@ -775,14 +815,25 @@ class Timestamp(_Timestamp): - 'raise' will raise an AmbiguousTimeError for an ambiguous time .. versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. 
+ + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 Raises ------ ValueError if the freq cannot be converted """ - return self._round(freq, RoundTo.MINUS_INFTY, ambiguous) + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) - def ceil(self, freq, ambiguous='raise'): + def ceil(self, freq, ambiguous='raise', nonexistent='raise'): """ return a new Timestamp ceiled to this resolution @@ -796,12 +847,23 @@ class Timestamp(_Timestamp): - 'raise' will raise an AmbiguousTimeError for an ambiguous time .. versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 Raises ------ ValueError if the freq cannot be converted """ - return self._round(freq, RoundTo.PLUS_INFTY, ambiguous) + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) @property def tz(self): @@ -961,7 +1023,8 @@ class Timestamp(_Timestamp): def is_leap_year(self): return bool(ccalendar.is_leapyear(self.year)) - def tz_localize(self, tz, ambiguous='raise', errors='raise'): + def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', + errors=None): """ Convert naive Timestamp to local time zone, or remove timezone from tz-aware Timestamp. @@ -973,19 +1036,38 @@ class Timestamp(_Timestamp): None will remove timezone holding local time. ambiguous : bool, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. + - bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates) - 'NaT' will return NaT for an ambiguous time - 'raise' will raise an AmbiguousTimeError for an ambiguous time - errors : 'raise', 'coerce', default 'raise' + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 + + errors : 'raise', 'coerce', default None - 'raise' will raise a NonExistentTimeError if a timestamp is not valid in the specified timezone (e.g. due to a transition from - or to DST time) + or to DST time). Use ``nonexistent='raise'`` instead. - 'coerce' will return NaT if the timestamp can not be converted - into the specified timezone + into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. versionadded:: 0.19.0 + .. 
deprecated:: 0.24.0 Returns ------- @@ -999,18 +1081,36 @@ class Timestamp(_Timestamp): if ambiguous == 'infer': raise ValueError('Cannot infer offset with only one time.') + if errors is not None: + warnings.warn("The errors argument is deprecated and will be " + "removed in a future release. Use " + "nonexistent='NaT' or nonexistent='raise' " + "instead.", FutureWarning) + if errors == 'coerce': + nonexistent = 'NaT' + elif errors == 'raise': + nonexistent = 'raise' + else: + raise ValueError("The errors argument must be either 'coerce' " + "or 'raise'.") + + if nonexistent not in ('raise', 'NaT', 'shift'): + raise ValueError("The nonexistent argument must be one of 'raise'," + " 'NaT' or 'shift'") + if self.tzinfo is None: # tz naive, localize tz = maybe_get_tz(tz) if not is_string_object(ambiguous): ambiguous = [ambiguous] value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz, - ambiguous=ambiguous, errors=errors)[0] + ambiguous=ambiguous, + nonexistent=nonexistent)[0] return Timestamp(value, tz=tz) else: if tz is None: # reset tz - value = tz_convert_single(self.value, 'UTC', self.tz) + value = tz_convert_single(self.value, UTC, self.tz) return Timestamp(value, tz=None) else: raise TypeError('Cannot localize tz-aware Timestamp, use ' @@ -1060,7 +1160,7 @@ class Timestamp(_Timestamp): minute : int, optional second : int, optional microsecond : int, optional - nanosecond: int, optional + nanosecond : int, optional tzinfo : tz-convertible, optional fold : int, optional, default is 0 added in 3.6, NotImplemented @@ -1080,7 +1180,7 @@ class Timestamp(_Timestamp): _tzinfo = self.tzinfo value = self.value if _tzinfo is not None: - value_tz = tz_convert_single(value, _tzinfo, 'UTC') + value_tz = tz_convert_single(value, _tzinfo, UTC) value += value - value_tz # setup components @@ -1190,6 +1290,10 @@ class Timestamp(_Timestamp): Normalize Timestamp to midnight, preserving tz information. 
""" + if self.tz is None or is_utc(self.tz): + DAY_NS = DAY_SECONDS * 1000000000 + normalized_value = self.value - (self.value % DAY_NS) + return Timestamp(normalized_value).tz_localize(self.tz) normalized_value = normalize_i8_timestamps( np.array([self.value], dtype='i8'), tz=self.tz)[0] return Timestamp(normalized_value).tz_localize(self.tz) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 8965b46f747c4..50c4a41f97a82 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -cdef bint is_utc(object tz) +cpdef bint is_utc(object tz) cdef bint is_tzlocal(object tz) cdef bint treat_tz_as_pytz(object tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index b7e4de81da35c..9f8922b274abd 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -2,6 +2,8 @@ from cython import Py_ssize_t +from cpython.datetime cimport tzinfo + # dateutil compat from dateutil.tz import ( tzutc as _dateutil_tzutc, @@ -27,7 +29,7 @@ cdef int64_t NPY_NAT = get_nat() # ---------------------------------------------------------------------- -cdef inline bint is_utc(object tz): +cpdef inline bint is_utc(object tz): return tz is UTC or isinstance(tz, _dateutil_tzutc) @@ -36,8 +38,8 @@ cdef inline bint is_tzlocal(object tz): cdef inline bint treat_tz_as_pytz(object tz): - return hasattr(tz, '_utc_transition_times') and hasattr( - tz, '_transition_info') + return (hasattr(tz, '_utc_transition_times') and + hasattr(tz, '_transition_info')) cdef inline bint treat_tz_as_dateutil(object tz): @@ -58,7 +60,7 @@ cpdef inline object get_timezone(object tz): UJSON/pytables. maybe_get_tz (below) is the inverse of this process. """ if is_utc(tz): - return 'UTC' + return tz else: if treat_tz_as_dateutil(tz): if '.tar.gz' in tz._filename: @@ -322,7 +324,7 @@ cpdef bint tz_compare(object start, object end): return get_timezone(start) == get_timezone(end) -cpdef tz_standardize(object tz): +def tz_standardize(tz: object): """ If the passed tz is a pytz timezone object, "normalize" it to the a consistent version diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index d4b61b8611b68..f517e0933264a 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -9,14 +9,15 @@ from libc.stdlib cimport malloc, free import numpy as np cimport numpy as cnp -from numpy cimport ndarray, double_t, int64_t, float64_t, float32_t +from numpy cimport ndarray, int64_t, float64_t, float32_t cnp.import_array() cdef extern from "src/headers/cmath" namespace "std": - bint isnan(double) nogil - int signbit(double) nogil - double sqrt(double x) nogil + bint isnan(float64_t) nogil + bint notnan(float64_t) nogil + int signbit(float64_t) nogil + float64_t sqrt(float64_t x) nogil cimport util from util cimport numeric @@ -31,7 +32,7 @@ cdef float64_t MINfloat64 = np.NINF cdef float32_t MAXfloat32 = np.inf cdef float64_t MAXfloat64 = np.inf -cdef double NaN = np.NaN +cdef float64_t NaN = np.NaN cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b @@ -79,6 +80,7 @@ def _check_minp(win, minp, N, floor=None): return max(minp, floor) + # original C implementation by N. Devillard. # This code in public domain. 
# Function : kth_smallest() @@ -121,14 +123,14 @@ cdef class MockFixedWindowIndexer(WindowIndexer): Parameters ---------- - input: ndarray - input data array + values: ndarray + values data array win: int64_t window size minp: int64_t min number of obs in a window to consider non-NaN index: object - index of the input + index of the values floor: optional unit for flooring left_closed: bint @@ -137,13 +139,13 @@ cdef class MockFixedWindowIndexer(WindowIndexer): right endpoint closedness """ - def __init__(self, ndarray input, int64_t win, int64_t minp, + def __init__(self, ndarray values, int64_t win, int64_t minp, bint left_closed, bint right_closed, object index=None, object floor=None): assert index is None self.is_variable = 0 - self.N = len(input) + self.N = len(values) self.minp = _check_minp(win, minp, self.N, floor=floor) self.start = np.empty(0, dtype='int64') self.end = np.empty(0, dtype='int64') @@ -159,14 +161,14 @@ cdef class FixedWindowIndexer(WindowIndexer): Parameters ---------- - input: ndarray - input data array + values: ndarray + values data array win: int64_t window size minp: int64_t min number of obs in a window to consider non-NaN index: object - index of the input + index of the values floor: optional unit for flooring the unit left_closed: bint @@ -175,14 +177,14 @@ cdef class FixedWindowIndexer(WindowIndexer): right endpoint closedness """ - def __init__(self, ndarray input, int64_t win, int64_t minp, + def __init__(self, ndarray values, int64_t win, int64_t minp, bint left_closed, bint right_closed, object index=None, object floor=None): cdef ndarray start_s, start_e, end_s, end_e assert index is None self.is_variable = 0 - self.N = len(input) + self.N = len(values) self.minp = _check_minp(win, minp, self.N, floor=floor) start_s = np.zeros(win, dtype='int64') @@ -204,14 +206,14 @@ cdef class VariableWindowIndexer(WindowIndexer): Parameters ---------- - input: ndarray - input data array + values: ndarray + values data array win: int64_t window size minp: int64_t min number of obs in a window to consider non-NaN index: ndarray - index of the input + index of the values left_closed: bint left endpoint closedness True if the left endpoint is closed, False if open @@ -221,7 +223,7 @@ cdef class VariableWindowIndexer(WindowIndexer): floor: optional unit for flooring the unit """ - def __init__(self, ndarray input, int64_t win, int64_t minp, + def __init__(self, ndarray values, int64_t win, int64_t minp, bint left_closed, bint right_closed, ndarray index, object floor=None): @@ -293,18 +295,18 @@ cdef class VariableWindowIndexer(WindowIndexer): end[i] -= 1 -def get_window_indexer(input, win, minp, index, closed, +def get_window_indexer(values, win, minp, index, closed, floor=None, use_mock=True): """ return the correct window indexer for the computation Parameters ---------- - input: 1d ndarray + values: 1d ndarray win: integer, window size minp: integer, minimum periods index: 1d ndarray, optional - index to the input array + index to the values array closed: string, default None {'right', 'left', 'both', 'neither'} window endpoint closedness. 
Defaults to 'right' in @@ -341,31 +343,32 @@ def get_window_indexer(input, win, minp, index, closed, left_closed = True if index is not None: - indexer = VariableWindowIndexer(input, win, minp, left_closed, + indexer = VariableWindowIndexer(values, win, minp, left_closed, right_closed, index, floor) elif use_mock: - indexer = MockFixedWindowIndexer(input, win, minp, left_closed, + indexer = MockFixedWindowIndexer(values, win, minp, left_closed, right_closed, index, floor) else: - indexer = FixedWindowIndexer(input, win, minp, left_closed, + indexer = FixedWindowIndexer(values, win, minp, left_closed, right_closed, index, floor) return indexer.get_data() + # ---------------------------------------------------------------------- # Rolling count # this is only an impl for index not None, IOW, freq aware -def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, count_x = 0.0 + float64_t val, count_x = 0.0 int64_t s, e, nobs, N Py_ssize_t i, j ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output - start, end, N, win, minp, _ = get_window_indexer(input, win, + start, end, N, win, minp, _ = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -380,22 +383,22 @@ def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, # setup count_x = 0.0 for j in range(s, e): - val = input[j] - if val == val: + val = values[j] + if notnan(val): count_x += 1.0 else: # calculate deletes for j in range(start[i - 1], s): - val = input[j] - if val == val: + val = values[j] + if notnan(val): count_x -= 1.0 # calculate adds for j in range(end[i - 1], e): - val = input[j] - if val == val: + val = values[j] + if notnan(val): count_x += 1.0 if count_x >= minp: @@ -405,12 +408,15 @@ def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, return output + # ---------------------------------------------------------------------- # Rolling sum -cdef inline double calc_sum(int64_t minp, int64_t nobs, double sum_x) nogil: - cdef double result +cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, + float64_t sum_x) nogil: + cdef: + float64_t result if nobs >= minp: result = sum_x @@ -420,34 +426,35 @@ cdef inline double calc_sum(int64_t minp, int64_t nobs, double sum_x) nogil: return result -cdef inline void add_sum(double val, int64_t *nobs, double *sum_x) nogil: +cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: """ add a value from the sum calc """ # Not NaN - if val == val: + if notnan(val): nobs[0] = nobs[0] + 1 sum_x[0] = sum_x[0] + val -cdef inline void remove_sum(double val, int64_t *nobs, double *sum_x) nogil: +cdef inline void remove_sum(float64_t val, + int64_t *nobs, float64_t *sum_x) nogil: """ remove a value from the sum calc """ - if val == val: + if notnan(val): nobs[0] = nobs[0] - 1 sum_x[0] = sum_x[0] - val -def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_sum(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, prev_x, sum_x = 0 + float64_t val, prev_x, sum_x = 0 int64_t s, e, range_endpoint int64_t nobs = 0, i, j, N bint is_variable ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed, 
floor=0) @@ -472,17 +479,17 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, sum_x = 0.0 nobs = 0 for j in range(s, e): - add_sum(input[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x) else: # calculate deletes for j in range(start[i - 1], s): - remove_sum(input[j], &nobs, &sum_x) + remove_sum(values[j], &nobs, &sum_x) # calculate adds for j in range(end[i - 1], e): - add_sum(input[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x) output[i] = calc_sum(minp, nobs, sum_x) @@ -495,31 +502,33 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, with nogil: for i in range(0, range_endpoint): - add_sum(input[i], &nobs, &sum_x) + add_sum(values[i], &nobs, &sum_x) output[i] = NaN for i in range(range_endpoint, N): - val = input[i] + val = values[i] add_sum(val, &nobs, &sum_x) if i > win - 1: - prev_x = input[i - win] + prev_x = values[i - win] remove_sum(prev_x, &nobs, &sum_x) output[i] = calc_sum(minp, nobs, sum_x) return output + # ---------------------------------------------------------------------- # Rolling mean -cdef inline double calc_mean(int64_t minp, Py_ssize_t nobs, - Py_ssize_t neg_ct, double sum_x) nogil: - cdef double result +cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, + Py_ssize_t neg_ct, float64_t sum_x) nogil: + cdef: + float64_t result if nobs >= minp: - result = sum_x / nobs + result = sum_x / nobs if neg_ct == 0 and result < 0: # all positive result = 0 @@ -533,40 +542,40 @@ cdef inline double calc_mean(int64_t minp, Py_ssize_t nobs, return result -cdef inline void add_mean(double val, Py_ssize_t *nobs, double *sum_x, +cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, Py_ssize_t *neg_ct) nogil: """ add a value from the mean calc """ # Not NaN - if val == val: + if notnan(val): nobs[0] = nobs[0] + 1 sum_x[0] = sum_x[0] + val if signbit(val): neg_ct[0] = neg_ct[0] + 1 -cdef inline void remove_mean(double val, Py_ssize_t *nobs, double *sum_x, +cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, Py_ssize_t *neg_ct) nogil: """ remove a value from the mean calc """ - if val == val: + if notnan(val): nobs[0] = nobs[0] - 1 sum_x[0] = sum_x[0] - val if signbit(val): neg_ct[0] = neg_ct[0] - 1 -def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_mean(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, prev_x, result, sum_x = 0 + float64_t val, prev_x, result, sum_x = 0 int64_t s, e bint is_variable Py_ssize_t nobs = 0, i, j, neg_ct = 0, N ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -589,19 +598,19 @@ def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, sum_x = 0.0 nobs = 0 for j in range(s, e): - val = input[j] + val = values[j] add_mean(val, &nobs, &sum_x, &neg_ct) else: # calculate deletes for j in range(start[i - 1], s): - val = input[j] + val = values[j] remove_mean(val, &nobs, &sum_x, &neg_ct) # calculate adds for j in range(end[i - 1], e): - val = input[j] + val = values[j] add_mean(val, &nobs, &sum_x, &neg_ct) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) @@ -610,29 +619,31 @@ def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, with nogil: for i in range(minp - 1): - val = input[i] + val = values[i] add_mean(val, &nobs, 
&sum_x, &neg_ct) output[i] = NaN for i in range(minp - 1, N): - val = input[i] + val = values[i] add_mean(val, &nobs, &sum_x, &neg_ct) if i > win - 1: - prev_x = input[i - win] + prev_x = values[i - win] remove_mean(prev_x, &nobs, &sum_x, &neg_ct) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) return output + # ---------------------------------------------------------------------- # Rolling variance -cdef inline double calc_var(int64_t minp, int ddof, double nobs, - double ssqdm_x) nogil: - cdef double result +cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, + float64_t ssqdm_x) nogil: + cdef: + float64_t result # Variance is unchanged if no observation is added or removed if (nobs >= minp) and (nobs > ddof): @@ -641,7 +652,7 @@ cdef inline double calc_var(int64_t minp, int ddof, double nobs, if nobs == 1: result = 0 else: - result = ssqdm_x / (nobs - ddof) + result = ssqdm_x / (nobs - ddof) if result < 0: result = 0 else: @@ -650,10 +661,12 @@ cdef inline double calc_var(int64_t minp, int ddof, double nobs, return result -cdef inline void add_var(double val, double *nobs, double *mean_x, - double *ssqdm_x) nogil: +cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, + float64_t *ssqdm_x) nogil: """ add a value from the var calc """ - cdef double delta + cdef: + float64_t delta + # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug if isnan(val): return @@ -666,13 +679,13 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] -cdef inline void remove_var(double val, double *nobs, double *mean_x, - double *ssqdm_x) nogil: +cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x, + float64_t *ssqdm_x) nogil: """ remove a value from the var calc """ - cdef double delta + cdef: + float64_t delta - # Not NaN - if val == val: + if notnan(val): nobs[0] = nobs[0] - 1 if nobs[0]: # a part of Welford's method for the online variance-calculation @@ -685,20 +698,21 @@ cdef inline void remove_var(double val, double *nobs, double *mean_x, ssqdm_x[0] = 0 -def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed, int ddof=1): """ Numerically stable implementation using Welford's method. 
""" cdef: - double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old + float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, + float64_t val, prev, delta, mean_x_old int64_t s, e bint is_variable Py_ssize_t i, j, N ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -724,7 +738,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, if i == 0: for j in range(s, e): - add_var(input[j], &nobs, &mean_x, &ssqdm_x) + add_var(values[j], &nobs, &mean_x, &ssqdm_x) else: @@ -733,11 +747,11 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # calculate adds for j in range(end[i - 1], e): - add_var(input[j], &nobs, &mean_x, &ssqdm_x) + add_var(values[j], &nobs, &mean_x, &ssqdm_x) # calculate deletes for j in range(start[i - 1], s): - remove_var(input[j], &nobs, &mean_x, &ssqdm_x) + remove_var(values[j], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) @@ -748,7 +762,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # Over the first window, observations can only be added, never # removed for i in range(win): - add_var(input[i], &nobs, &mean_x, &ssqdm_x) + add_var(values[i], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) # a part of Welford's method for the online variance-calculation @@ -757,10 +771,10 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # After the first window, observations can both be added and # removed for i in range(win, N): - val = input[i] - prev = input[i - win] + val = values[i] + prev = values[i - win] - if val == val: + if notnan(val): if prev == prev: # Adding one observation and removing another one @@ -785,13 +799,15 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # ---------------------------------------------------------------------- # Rolling skewness -cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, - double xxx) nogil: - cdef double result, dnobs - cdef double A, B, C, R +cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, + float64_t x, float64_t xx, + float64_t xxx) nogil: + cdef: + float64_t result, dnobs + float64_t A, B, C, R if nobs >= minp: - dnobs = nobs + dnobs = nobs A = x / dnobs B = xx / dnobs - A * A C = xxx / dnobs - A * A * A - 3 * A * B @@ -817,12 +833,13 @@ cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, return result -cdef inline void add_skew(double val, int64_t *nobs, double *x, double *xx, - double *xxx) nogil: +cdef inline void add_skew(float64_t val, int64_t *nobs, + float64_t *x, float64_t *xx, + float64_t *xxx) nogil: """ add a value from the skew calc """ # Not NaN - if val == val: + if notnan(val): nobs[0] = nobs[0] + 1 # seriously don't ask me why this is faster @@ -831,12 +848,13 @@ cdef inline void add_skew(double val, int64_t *nobs, double *x, double *xx, xxx[0] = xxx[0] + val * val * val -cdef inline void remove_skew(double val, int64_t *nobs, double *x, double *xx, - double *xxx) nogil: +cdef inline void remove_skew(float64_t val, int64_t *nobs, + float64_t *x, float64_t *xx, + float64_t *xxx) nogil: """ remove a value from the skew calc """ # Not NaN - if val == val: + if notnan(val): nobs[0] = nobs[0] - 1 # seriously don't ask me why this is faster @@ -845,18 +863,18 @@ cdef inline void 
remove_skew(double val, int64_t *nobs, double *x, double *xx, xxx[0] = xxx[0] - val * val * val -def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_skew(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, prev - double x = 0, xx = 0, xxx = 0 + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0 int64_t nobs = 0, i, j, N int64_t s, e bint is_variable ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -875,7 +893,7 @@ def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, if i == 0: for j in range(s, e): - val = input[j] + val = values[j] add_skew(val, &nobs, &x, &xx, &xxx) else: @@ -885,12 +903,12 @@ def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, # calculate adds for j in range(end[i - 1], e): - val = input[j] + val = values[j] add_skew(val, &nobs, &x, &xx, &xxx) # calculate deletes for j in range(start[i - 1], s): - val = input[j] + val = values[j] remove_skew(val, &nobs, &x, &xx, &xxx) output[i] = calc_skew(minp, nobs, x, xx, xxx) @@ -899,33 +917,36 @@ def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, with nogil: for i in range(minp - 1): - val = input[i] + val = values[i] add_skew(val, &nobs, &x, &xx, &xxx) output[i] = NaN for i in range(minp - 1, N): - val = input[i] + val = values[i] add_skew(val, &nobs, &x, &xx, &xxx) if i > win - 1: - prev = input[i - win] + prev = values[i - win] remove_skew(prev, &nobs, &x, &xx, &xxx) output[i] = calc_skew(minp, nobs, x, xx, xxx) return output + # ---------------------------------------------------------------------- # Rolling kurtosis -cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, - double xxx, double xxxx) nogil: - cdef double result, dnobs - cdef double A, B, C, D, R, K +cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs, + float64_t x, float64_t xx, + float64_t xxx, float64_t xxxx) nogil: + cdef: + float64_t result, dnobs + float64_t A, B, C, D, R, K if nobs >= minp: - dnobs = nobs + dnobs = nobs A = x / dnobs R = A * A B = xx / dnobs - R @@ -954,12 +975,13 @@ cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, return result -cdef inline void add_kurt(double val, int64_t *nobs, double *x, double *xx, - double *xxx, double *xxxx) nogil: +cdef inline void add_kurt(float64_t val, int64_t *nobs, + float64_t *x, float64_t *xx, + float64_t *xxx, float64_t *xxxx) nogil: """ add a value from the kurotic calc """ # Not NaN - if val == val: + if notnan(val): nobs[0] = nobs[0] + 1 # seriously don't ask me why this is faster @@ -969,12 +991,13 @@ cdef inline void add_kurt(double val, int64_t *nobs, double *x, double *xx, xxxx[0] = xxxx[0] + val * val * val * val -cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, - double *xxx, double *xxxx) nogil: +cdef inline void remove_kurt(float64_t val, int64_t *nobs, + float64_t *x, float64_t *xx, + float64_t *xxx, float64_t *xxxx) nogil: """ remove a value from the kurotic calc """ # Not NaN - if val == val: + if notnan(val): nobs[0] = nobs[0] - 1 # seriously don't ask me why this is faster @@ -984,18 +1007,18 @@ cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, xxxx[0] = xxxx[0] - val * val * val * val -def roll_kurt(ndarray[double_t] 
input, int64_t win, int64_t minp, +def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, prev - double x = 0, xx = 0, xxx = 0, xxxx = 0 + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 int64_t nobs = 0, i, j, N int64_t s, e bint is_variable ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -1014,7 +1037,7 @@ def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, if i == 0: for j in range(s, e): - add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) else: @@ -1023,11 +1046,11 @@ def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, # calculate adds for j in range(end[i - 1], e): - add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) # calculate deletes for j in range(start[i - 1], s): - remove_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) @@ -1036,28 +1059,29 @@ def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, with nogil: for i in range(minp - 1): - add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) output[i] = NaN for i in range(minp - 1, N): - add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) if i > win - 1: - prev = input[i - win] + prev = values[i - win] remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) return output + # ---------------------------------------------------------------------- # Rolling median, min, max -def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, +def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: - double val, res, prev + float64_t val, res, prev bint err = 0, is_variable int ret = 0 skiplist_t *sl @@ -1065,12 +1089,12 @@ def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, int64_t nobs = 0, N, s, e int midpoint ndarray[int64_t] start, end - ndarray[double_t] output + ndarray[float64_t] output # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs start, end, N, win, minp, is_variable = get_window_indexer( - input, win, + values, win, minp, index, closed, use_mock=False) output = np.empty(N, dtype=float) @@ -1088,8 +1112,8 @@ def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, if i == 0: # setup - val = input[i] - if val == val: + val = values[i] + if notnan(val): nobs += 1 err = skiplist_insert(sl, val) != 1 if err: @@ -1099,15 +1123,15 @@ def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, # calculate deletes for j in range(start[i - 1], s): - val = input[j] - if val == val: + val = values[j] + if notnan(val): skiplist_remove(sl, val) nobs -= 1 # calculate adds for j in range(end[i - 1], e): - val = input[j] - if val == val: + val = values[j] + if notnan(val): nobs += 1 err = skiplist_insert(sl, val) != 1 if err: @@ -1130,6 +1154,7 @@ def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, raise MemoryError("skiplist_insert failed") return output + # 
---------------------------------------------------------------------- # Moving maximum / minimum code taken from Bottleneck under the terms @@ -1167,7 +1192,8 @@ cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, numeric value) nogil: - cdef numeric result + cdef: + numeric result if numeric in cython.floating: if nobs >= minp: @@ -1180,14 +1206,14 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, return result -def roll_max(ndarray[numeric] input, int64_t win, int64_t minp, +def roll_max(ndarray[numeric] values, int64_t win, int64_t minp, object index, object closed): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - input: numpy array + values: numpy array window: int, size of rolling window minp: if number of observations in window is below this, output a NaN @@ -1197,27 +1223,27 @@ def roll_max(ndarray[numeric] input, int64_t win, int64_t minp, make the interval closed on the right, left, both or neither endpoints """ - return _roll_min_max(input, win, minp, index, closed=closed, is_max=1) + return _roll_min_max(values, win, minp, index, closed=closed, is_max=1) -def roll_min(ndarray[numeric] input, int64_t win, int64_t minp, +def roll_min(ndarray[numeric] values, int64_t win, int64_t minp, object index, object closed): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - input: numpy array + values: numpy array window: int, size of rolling window minp: if number of observations in window is below this, output a NaN index: ndarray, optional index for window computation """ - return _roll_min_max(input, win, minp, index, is_max=0, closed=closed) + return _roll_min_max(values, win, minp, index, is_max=0, closed=closed) -cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, +cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, object index, object closed, bint is_max): """ Moving min/max of 1d array of any numeric type along axis=0 @@ -1229,17 +1255,17 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, bint is_variable starti, endi, N, win, minp, is_variable = get_window_indexer( - input, win, + values, win, minp, index, closed) if is_variable: - return _roll_min_max_variable(input, starti, endi, N, win, minp, + return _roll_min_max_variable(values, starti, endi, N, win, minp, is_max) else: - return _roll_min_max_fixed(input, starti, endi, N, win, minp, is_max) + return _roll_min_max_fixed(values, starti, endi, N, win, minp, is_max) -cdef _roll_min_max_variable(ndarray[numeric] input, +cdef _roll_min_max_variable(ndarray[numeric] values, ndarray[int64_t] starti, ndarray[int64_t] endi, int64_t N, @@ -1252,7 +1278,7 @@ cdef _roll_min_max_variable(ndarray[numeric] input, Py_ssize_t nobs = 0 deque Q[int64_t] # min/max always the front deque W[int64_t] # track the whole window for nobs compute - ndarray[double_t, ndim=1] output + ndarray[float64_t, ndim=1] output output = np.empty(N, dtype=float) Q = deque[int64_t]() @@ -1266,16 +1292,16 @@ cdef _roll_min_max_variable(ndarray[numeric] input, # So the code was optimized for that for i from starti[0] <= i < endi[0]: - ai = init_mm(input[i], &nobs, is_max) + ai = init_mm(values[i], &nobs, is_max) # Discard previous entries if we find new min or max if is_max: - while not Q.empty() and ((ai >= input[Q.back()]) or - (input[Q.back()] != input[Q.back()])): + while not Q.empty() and ((ai >= values[Q.back()]) or + 
values[Q.back()] != values[Q.back()]): Q.pop_back() else: - while not Q.empty() and ((ai <= input[Q.back()]) or - (input[Q.back()] != input[Q.back()])): + while not Q.empty() and ((ai <= values[Q.back()]) or + values[Q.back()] != values[Q.back()]): Q.pop_back() Q.push_back(i) W.push_back(i) @@ -1286,20 +1312,20 @@ cdef _roll_min_max_variable(ndarray[numeric] input, for i in range(endi[0], endi[N-1]): if not Q.empty(): output[i-1+close_offset] = calc_mm( - minp, nobs, input[Q.front()]) + minp, nobs, values[Q.front()]) else: output[i-1+close_offset] = NaN - ai = init_mm(input[i], &nobs, is_max) + ai = init_mm(values[i], &nobs, is_max) # Discard previous entries if we find new min or max if is_max: - while not Q.empty() and ((ai >= input[Q.back()]) or - (input[Q.back()] != input[Q.back()])): + while not Q.empty() and ((ai >= values[Q.back()]) or + values[Q.back()] != values[Q.back()]): Q.pop_back() else: - while not Q.empty() and ((ai <= input[Q.back()]) or - (input[Q.back()] != input[Q.back()])): + while not Q.empty() and ((ai <= values[Q.back()]) or + values[Q.back()] != values[Q.back()]): Q.pop_back() # Maintain window/nobs retention @@ -1307,18 +1333,18 @@ cdef _roll_min_max_variable(ndarray[numeric] input, while not Q.empty() and Q.front() <= i - curr_win_size: Q.pop_front() while not W.empty() and W.front() <= i - curr_win_size: - remove_mm(input[W.front()], &nobs) + remove_mm(values[W.front()], &nobs) W.pop_front() Q.push_back(i) W.push_back(i) - output[N-1] = calc_mm(minp, nobs, input[Q.front()]) + output[N-1] = calc_mm(minp, nobs, values[Q.front()]) return output -cdef _roll_min_max_fixed(ndarray[numeric] input, +cdef _roll_min_max_fixed(ndarray[numeric] values, ndarray[int64_t] starti, ndarray[int64_t] endi, int64_t N, @@ -1335,7 +1361,7 @@ cdef _roll_min_max_fixed(ndarray[numeric] input, numeric* minvalue numeric* end numeric* last - ndarray[double_t, ndim=1] output + ndarray[float64_t, ndim=1] output output = np.empty(N, dtype=float) # setup the rings of death! 
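The variable-window path above (`_roll_min_max_variable`) is built on a monotonic deque: it stores candidate indices whose values are non-increasing, so the front index is always the current window extremum and dominated entries are discarded from the back. A rough pure-Python sketch of that idea for a fixed window, illustrative only (simplified NaN handling, made-up helper name, not the Cython code being patched):

from collections import deque
import math

def rolling_max(values, win, minp):
    q = deque()      # candidate indices; their values are non-increasing
    nobs = 0         # non-NaN observations currently in the window
    out = []
    for i, v in enumerate(values):
        if not math.isnan(v):
            nobs += 1
        if i >= win and not math.isnan(values[i - win]):
            nobs -= 1                      # observation leaving the window
        while q and q[0] <= i - win:
            q.popleft()                    # front index fell out of the window
        while q and values[q[-1]] <= v:
            q.pop()                        # dominated by the new value
        if not math.isnan(v):
            q.append(i)
        out.append(values[q[0]] if q and nobs >= minp else math.nan)
    return out

# rolling_max([3.0, 1.0, math.nan, 5.0, 2.0], win=2, minp=1)
# -> [3.0, 3.0, 1.0, 5.0, 5.0], matching Series.rolling(2, min_periods=1).max()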
@@ -1345,18 +1371,18 @@ cdef _roll_min_max_fixed(ndarray[numeric] input, end = ring + win last = ring minvalue = ring - ai = input[0] - minvalue[0] = init_mm(input[0], &nobs, is_max) + ai = values[0] + minvalue[0] = init_mm(values[0], &nobs, is_max) death[0] = win nobs = 0 with nogil: for i in range(N): - ai = init_mm(input[i], &nobs, is_max) + ai = init_mm(values[i], &nobs, is_max) if i >= win: - remove_mm(input[i - win], &nobs) + remove_mm(values[i - win], &nobs) if death[minvalue - ring] == i: minvalue = minvalue + 1 @@ -1425,21 +1451,21 @@ interpolation_types = { } -def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, +def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, int64_t minp, object index, object closed, - double quantile, str interpolation): + float64_t quantile, str interpolation): """ O(N log(window)) implementation using skip list """ cdef: - double val, prev, midpoint, idx_with_fraction + float64_t val, prev, midpoint, idx_with_fraction skiplist_t *skiplist int64_t nobs = 0, i, j, s, e, N Py_ssize_t idx bint is_variable ndarray[int64_t] start, end - ndarray[double_t] output - double vlow, vhigh + ndarray[float64_t] output + float64_t vlow, vhigh InterpolationType interpolation_type int ret = 0 @@ -1449,13 +1475,13 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, try: interpolation_type = interpolation_types[interpolation] except KeyError: - raise ValueError("Interpolation '{}' is not supported" - .format(interpolation)) + raise ValueError("Interpolation '{interp}' is not supported" + .format(interp=interpolation)) # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs start, end, N, win, minp, is_variable = get_window_indexer( - input, win, + values, win, minp, index, closed, use_mock=False) output = np.empty(N, dtype=float) @@ -1471,8 +1497,8 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, if i == 0: # setup - val = input[i] - if val == val: + val = values[i] + if notnan(val): nobs += 1 skiplist_insert(skiplist, val) @@ -1480,15 +1506,15 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, # calculate deletes for j in range(start[i - 1], s): - val = input[j] - if val == val: + val = values[j] + if notnan(val): skiplist_remove(skiplist, val) nobs -= 1 # calculate adds for j in range(end[i - 1], e): - val = input[j] - if val == val: + val = values[j] + if notnan(val): nobs += 1 skiplist_insert(skiplist, val) @@ -1498,7 +1524,7 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, output[i] = skiplist_get(skiplist, 0, &ret) else: idx_with_fraction = quantile * (nobs - 1) - idx = idx_with_fraction + idx = idx_with_fraction if idx_with_fraction == idx: # no need to interpolate @@ -1529,7 +1555,7 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, elif interpolation_type == MIDPOINT: vlow = skiplist_get(skiplist, idx, &ret) vhigh = skiplist_get(skiplist, idx + 1, &ret) - output[i] = (vlow + vhigh) / 2 + output[i] = (vlow + vhigh) / 2 else: output[i] = NaN @@ -1543,7 +1569,7 @@ def roll_generic(object obj, int offset, object func, bint raw, object args, object kwargs): cdef: - ndarray[double_t] output, counts, bufarr + ndarray[float64_t] output, counts, bufarr ndarray[float64_t, cast=True] arr float64_t *buf float64_t *oldbuf @@ -1611,17 +1637,17 @@ def roll_generic(object obj, output[i] = NaN # remaining full-length windows - buf = arr.data + buf = arr.data bufarr = np.empty(win, dtype=float) - 
oldbuf = bufarr.data + oldbuf = bufarr.data for i from (win - offset) <= i < (N - offset): buf = buf + 1 - bufarr.data = buf + bufarr.data = buf if counts[i] >= minp: output[i] = func(bufarr, *args, **kwargs) else: output[i] = NaN - bufarr.data = oldbuf + bufarr.data = oldbuf # truncated windows at the end for i from int_max(N - offset, 0) <= i < N: @@ -1635,18 +1661,18 @@ def roll_generic(object obj, return output -def roll_window(ndarray[float64_t, ndim=1, cast=True] input, +def roll_window(ndarray[float64_t, ndim=1, cast=True] values, ndarray[float64_t, ndim=1, cast=True] weights, int minp, bint avg=True): """ - Assume len(weights) << len(input) + Assume len(weights) << len(values) """ cdef: - ndarray[double_t] output, tot_wgt, counts + ndarray[float64_t] output, tot_wgt, counts Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k float64_t val_in, val_win, c, w - in_n = len(input) + in_n = len(values) win_n = len(weights) output = np.zeros(in_n, dtype=float) counts = np.zeros(in_n, dtype=float) @@ -1662,7 +1688,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, continue for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = input[in_i] + val_in = values[in_i] if val_in == val_in: output[in_i + (win_n - win_i) - 1] += val_in * val_win counts[in_i + (win_n - win_i) - 1] += 1 @@ -1686,7 +1712,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, continue for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = input[in_i] + val_in = values[in_i] if val_in == val_in: output[in_i + (win_n - win_i) - 1] += val_in * val_win @@ -1703,7 +1729,8 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, # Exponentially weighted moving average -def ewma(double_t[:] vals, double_t com, int adjust, int ignore_na, int minp): +def ewma(float64_t[:] vals, float64_t com, + int adjust, int ignore_na, int minp): """ Compute exponentially-weighted moving average using center-of-mass. @@ -1722,8 +1749,8 @@ def ewma(double_t[:] vals, double_t com, int adjust, int ignore_na, int minp): cdef: Py_ssize_t N = len(vals) - ndarray[double_t] output = np.empty(N, dtype=float) - double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur + ndarray[float64_t] output = np.empty(N, dtype=float) + float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur Py_ssize_t i, nobs if N == 0: @@ -1767,12 +1794,13 @@ def ewma(double_t[:] vals, double_t com, int adjust, int ignore_na, int minp): return output + # ---------------------------------------------------------------------- # Exponentially weighted moving covariance -def ewmcov(double_t[:] input_x, double_t[:] input_y, - double_t com, int adjust, int ignore_na, int minp, int bias): +def ewmcov(float64_t[:] input_x, float64_t[:] input_y, + float64_t com, int adjust, int ignore_na, int minp, int bias): """ Compute exponentially-weighted moving variance using center-of-mass. 
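For reference, the center-of-mass parameter in `ewma` above maps to a smoothing factor alpha = 1 / (1 + com), and with adjust=True each output is a weighted average of past values with weights (1 - alpha)**i. A simplified pure-Python sketch of that recurrence (no NaN handling or `minp` masking; `ewma_adjusted` is an illustrative name, not the Cython routine):

import numpy as np

def ewma_adjusted(vals, com):
    alpha = 1.0 / (1.0 + com)       # pandas' center-of-mass convention
    old_wt_factor = 1.0 - alpha
    out = np.empty(len(vals), dtype=float)
    weighted_avg = float(vals[0])
    old_wt = 1.0                    # accumulated weight of past observations
    out[0] = weighted_avg
    for i in range(1, len(vals)):
        old_wt *= old_wt_factor     # decay everything seen so far
        # the new observation enters with weight 1 (adjust=True)
        weighted_avg = (old_wt * weighted_avg + vals[i]) / (old_wt + 1.0)
        old_wt += 1.0
        out[i] = weighted_avg
    return out

# ewma_adjusted([1.0, 2.0, 3.0], com=1)
# -> [1.0, 1.6666..., 2.4285...], matching Series.ewm(com=1, adjust=True).mean()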
@@ -1793,10 +1821,10 @@ def ewmcov(double_t[:] input_x, double_t[:] input_y, cdef: Py_ssize_t N = len(input_x) - double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov - double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov + float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y Py_ssize_t i, nobs - ndarray[double_t] output + ndarray[float64_t] output if len(input_y) != N: raise ValueError("arrays are of different lengths " diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 9af12cbec1e9c..6449a331689ad 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -23,7 +23,7 @@ ctypedef fused pandas_string: @cython.boundscheck(False) @cython.wraparound(False) def write_csv_rows(list data, ndarray data_index, - int nlevels, ndarray cols, object writer): + Py_ssize_t nlevels, ndarray cols, object writer): """ Write the given data to the writer object, pre-allocating where possible for performance improvements. @@ -36,21 +36,16 @@ def write_csv_rows(list data, ndarray data_index, cols : ndarray writer : object """ + # In crude testing, N>100 yields little marginal improvement cdef: - int N, j, i, ncols + Py_ssize_t i, j, k = len(data_index), N = 100, ncols = len(cols) list rows - object val - - # In crude testing, N>100 yields little marginal improvement - N = 100 # pre-allocate rows - ncols = len(cols) - rows = [[None] * (nlevels + ncols) for x in range(N)] + rows = [[None] * (nlevels + ncols) for _ in range(N)] - j = -1 if nlevels == 1: - for j in range(len(data_index)): + for j in range(k): row = rows[j % N] row[0] = data_index[j] for i in range(ncols): @@ -59,7 +54,7 @@ def write_csv_rows(list data, ndarray data_index, if j >= N - 1 and j % N == N - 1: writer.writerows(rows) elif nlevels > 1: - for j in range(len(data_index)): + for j in range(k): row = rows[j % N] row[:nlevels] = list(data_index[j]) for i in range(ncols): @@ -68,7 +63,7 @@ def write_csv_rows(list data, ndarray data_index, if j >= N - 1 and j % N == N - 1: writer.writerows(rows) else: - for j in range(len(data_index)): + for j in range(k): row = rows[j % N] for i in range(ncols): row[i] = data[i][j] @@ -90,8 +85,9 @@ def convert_json_to_lines(object arr): cdef: Py_ssize_t i = 0, num_open_brackets_seen = 0, length bint in_quotes = 0, is_escaping = 0 - ndarray[uint8_t] narr - unsigned char v, comma, left_bracket, right_brack, newline + ndarray[uint8_t, ndim=1] narr + unsigned char val, newline, comma, left_bracket, right_bracket, quote + unsigned char backslash newline = ord('\n') comma = ord(',') @@ -103,18 +99,18 @@ def convert_json_to_lines(object arr): narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy() length = narr.shape[0] for i in range(length): - v = narr[i] - if v == quote and i > 0 and not is_escaping: + val = narr[i] + if val == quote and i > 0 and not is_escaping: in_quotes = ~in_quotes - if v == backslash or is_escaping: + if val == backslash or is_escaping: is_escaping = ~is_escaping - if v == comma: # commas that should be \n + if val == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: narr[i] = newline - elif v == left_bracket: + elif val == left_bracket: if not in_quotes: num_open_brackets_seen += 1 - elif v == right_bracket: + elif val == right_bracket: if not in_quotes: num_open_brackets_seen -= 1 @@ -128,16 +124,16 @@ def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t: """ return the maximum size of elements in a 1-dim string 
array """ cdef: Py_ssize_t i, m = 0, l = 0, length = arr.shape[0] - pandas_string v + pandas_string val for i in range(length): - v = arr[i] - if isinstance(v, str): - l = PyString_GET_SIZE(v) - elif isinstance(v, bytes): - l = PyBytes_GET_SIZE(v) - elif isinstance(v, unicode): - l = PyUnicode_GET_SIZE(v) + val = arr[i] + if isinstance(val, str): + l = PyString_GET_SIZE(val) + elif isinstance(val, bytes): + l = PyBytes_GET_SIZE(val) + elif isinstance(val, unicode): + l = PyUnicode_GET_SIZE(val) if l > m: m = l @@ -159,7 +155,7 @@ def string_array_replace_from_nan_rep( they are 'nan_rep'. Return the same array. """ cdef: - int length = arr.shape[0], i = 0 + Py_ssize_t length = len(arr), i = 0 if replace is None: replace = np.nan diff --git a/pandas/_version.py b/pandas/_version.py index f4c8938c683da..d000539421b91 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -12,6 +12,7 @@ import re import subprocess import sys + from pandas.compat import PY3 @@ -237,14 +238,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): + fmt = ("tag '{full_tag}' doesn't start with prefix " + "'{tag_prefix}'") + msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix) if verbose: - fmt = "tag '{full_tag}' doesn't start with prefix " \ - "'{tag_prefix}'" - print(fmt.format(full_tag=full_tag, tag_prefix=tag_prefix)) - pieces["error"] = ("tag '{full_tag}' doesn't start with " - "prefix '{tag_prefix}'".format( - full_tag, tag_prefix)) + print(msg) + pieces["error"] = msg return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 8a515661920f3..cb6241016d82f 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,10 +1,10 @@ -"""Public API for extending panadas objects.""" +"""Public API for extending pandas objects.""" from pandas.core.accessor import (register_dataframe_accessor, # noqa register_index_accessor, register_series_accessor) from pandas.core.algorithms import take # noqa -from pandas.core.arrays.base import (ExtensionArray, # noqa - ExtensionScalarOpsMixin) +from pandas.core.arrays import (ExtensionArray, # noqa + ExtensionScalarOpsMixin) from pandas.core.dtypes.dtypes import ( # noqa ExtensionDtype, register_extension_dtype ) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 1453725225e7d..f9c659106a516 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -139,8 +139,10 @@ def lfilter(*args, **kwargs): Hashable = collections.abc.Hashable Iterable = collections.abc.Iterable Mapping = collections.abc.Mapping + MutableMapping = collections.abc.MutableMapping Sequence = collections.abc.Sequence Sized = collections.abc.Sized + Set = collections.abc.Set else: # Python 2 @@ -199,8 +201,10 @@ def get_range_parameters(data): Hashable = collections.Hashable Iterable = collections.Iterable Mapping = collections.Mapping + MutableMapping = collections.MutableMapping Sequence = collections.Sequence Sized = collections.Sized + Set = collections.Set if PY2: def iteritems(obj, **kw): diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index a6f586c7f2638..5e67cf2ee2837 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -9,19 +9,16 @@ # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) 
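The `write_csv_rows` rewrite a little further up keeps a fixed pool of pre-allocated row lists and flushes them to the writer every N rows (N = 100 chosen empirically, per the comment). A small pure-Python sketch of that buffering pattern, with illustrative names and without the index-level handling of the real routine:

import csv
import io

def write_rows_chunked(columns, writer, chunk=100):
    # reuse a small pool of pre-allocated row lists and flush them in chunks
    nrows = len(columns[0]) if columns else 0
    ncols = len(columns)
    rows = [[None] * ncols for _ in range(chunk)]
    j = -1
    for j in range(nrows):
        row = rows[j % chunk]
        for i in range(ncols):
            row[i] = columns[i][j]
        if j % chunk == chunk - 1:
            writer.writerows(rows)                    # a full chunk is ready
    if (j + 1) % chunk:
        writer.writerows(rows[:(j + 1) % chunk])      # leftover partial chunk

buf = io.StringIO()
write_rows_chunked([[1, 2, 3], ["a", "b", "c"]], csv.writer(buf))
# buf.getvalue() == "1,a\r\n2,b\r\n3,c\r\n"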
-_np_version_under1p10 = _nlv < LooseVersion('1.10') -_np_version_under1p11 = _nlv < LooseVersion('1.11') -_np_version_under1p12 = _nlv < LooseVersion('1.12') _np_version_under1p13 = _nlv < LooseVersion('1.13') _np_version_under1p14 = _nlv < LooseVersion('1.14') _np_version_under1p15 = _nlv < LooseVersion('1.15') -if _nlv < '1.9': +if _nlv < '1.12': raise ImportError('this version of pandas is incompatible with ' - 'numpy < 1.9.0\n' + 'numpy < 1.12.0\n' 'your numpy version is {0}.\n' - 'Please upgrade numpy to >= 1.9.0 to use ' + 'Please upgrade numpy to >= 1.12.0 to use ' 'this pandas version'.format(_np_version)) @@ -43,9 +40,7 @@ def np_datetime64_compat(s, *args, **kwargs): tz-changes in 1.11 that make '2015-01-01 09:00:00Z' show a deprecation warning, when need to pass '2015-01-01 09:00:00' """ - - if not _np_version_under1p11: - s = tz_replacer(s) + s = tz_replacer(s) return np.datetime64(s, *args, **kwargs) @@ -56,23 +51,17 @@ def np_array_datetime64_compat(arr, *args, **kwargs): tz-changes in 1.11 that make '2015-01-01 09:00:00Z' show a deprecation warning, when need to pass '2015-01-01 09:00:00' """ - - if not _np_version_under1p11: - - # is_list_like - if hasattr(arr, '__iter__') and not \ - isinstance(arr, string_and_binary_types): - arr = [tz_replacer(s) for s in arr] - else: - arr = tz_replacer(arr) + # is_list_like + if (hasattr(arr, '__iter__') + and not isinstance(arr, string_and_binary_types)): + arr = [tz_replacer(s) for s in arr] + else: + arr = tz_replacer(arr) return np.array(arr, *args, **kwargs) __all__ = ['np', - '_np_version_under1p10', - '_np_version_under1p11', - '_np_version_under1p12', '_np_version_under1p13', '_np_version_under1p14', '_np_version_under1p15' diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index d42be56963569..30fdeca35faf3 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -19,11 +19,13 @@ """ from numpy import ndarray -from pandas.util._validators import (validate_args, validate_kwargs, - validate_args_and_kwargs) -from pandas.errors import UnsupportedFunctionCall -from pandas.core.dtypes.common import is_integer, is_bool + from pandas.compat import OrderedDict +from pandas.errors import UnsupportedFunctionCall +from pandas.util._validators import ( + validate_args, validate_args_and_kwargs, validate_kwargs) + +from pandas.core.dtypes.common import is_bool, is_integer class CompatValidator(object): @@ -360,3 +362,24 @@ def validate_resampler_func(method, args, kwargs): "{func}() instead".format(func=method))) else: raise TypeError("too many arguments passed in") + + +def validate_minmax_axis(axis): + """ + Ensure that the axis argument passed to min, max, argmin, or argmax is + zero or None, as otherwise it will be incorrectly ignored. + + Parameters + ---------- + axis : int or None + + Raises + ------ + ValueError + """ + ndim = 1 # hard-coded for Index + if axis is None: + return + if axis >= ndim or (axis < 0 and ndim + axis < 0): + raise ValueError("`axis` must be fewer than the number of " + "dimensions ({ndim})".format(ndim=ndim)) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 713a5b1120beb..61295b8249f58 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -2,12 +2,14 @@ Support pre-0.12 series pickle compatibility. 
""" -import sys -import pandas # noqa import copy import pickle as pkl -from pandas import compat, Index -from pandas.compat import u, string_types # noqa +import sys + +from pandas.compat import string_types, u # noqa + +import pandas # noqa +from pandas import Index, compat def load_reduce(self): @@ -56,8 +58,21 @@ def load_reduce(self): # If classes are moved, provide compat here. _class_locations_map = { + ('pandas.core.sparse.array', 'SparseArray'): + ('pandas.core.arrays', 'SparseArray'), # 15477 + # + # TODO: When FrozenNDArray is removed, add + # the following lines for compat: + # + # ('pandas.core.base', 'FrozenNDArray'): + # ('numpy', 'ndarray'), + # ('pandas.core.indexes.frozen', 'FrozenNDArray'): + # ('numpy', 'ndarray'), + # + # Afterwards, remove the current entry + # for `pandas.core.base.FrozenNDArray`. ('pandas.core.base', 'FrozenNDArray'): ('pandas.core.indexes.frozen', 'FrozenNDArray'), ('pandas.core.base', 'FrozenList'): @@ -88,7 +103,7 @@ def load_reduce(self): # 15998 top-level dirs moving ('pandas.sparse.array', 'SparseArray'): - ('pandas.core.sparse.array', 'SparseArray'), + ('pandas.core.arrays.sparse', 'SparseArray'), ('pandas.sparse.series', 'SparseSeries'): ('pandas.core.sparse.series', 'SparseSeries'), ('pandas.sparse.frame', 'SparseDataFrame'): @@ -195,10 +210,10 @@ def load(fh, encoding=None, compat=False, is_verbose=False): Parameters ---------- - fh: a filelike object - encoding: an optional encoding - compat: provide Series compatibility mode, boolean, default False - is_verbose: show exception output + fh : a filelike object + encoding : an optional encoding + compat : provide Series compatibility mode, boolean, default False + is_verbose : show exception output """ try: diff --git a/pandas/computation/expressions.py b/pandas/computation/expressions.py deleted file mode 100644 index d194cd2404c9d..0000000000000 --- a/pandas/computation/expressions.py +++ /dev/null @@ -1,15 +0,0 @@ -import warnings - - -def set_use_numexpr(v=True): - """ - .. deprecated:: 0.20.0 - Use ``pandas.set_option('compute.use_numexpr', v)`` instead. - """ - warnings.warn("pandas.computation.expressions.set_use_numexpr is " - "deprecated and will be removed in a future version.\n" - "you can toggle usage of numexpr via " - "pandas.get_option('compute.use_numexpr')", - FutureWarning, stacklevel=2) - from pandas import set_option - set_option('compute.use_numexpr', v) diff --git a/pandas/conftest.py b/pandas/conftest.py index 621de3ffd4b12..20f97bdec1107 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,18 +1,29 @@ -import os +from datetime import date, time, timedelta +from decimal import Decimal import importlib +import os +from dateutil.tz import tzlocal, tzutc +import hypothesis +from hypothesis import strategies as st +import numpy as np import pytest +from pytz import FixedOffset, utc -import pandas -import numpy as np -import pandas as pd -from pandas.compat import PY3 +from pandas.compat import PY3, u import pandas.util._test_decorators as td -import hypothesis +import pandas as pd hypothesis.settings.register_profile( "ci", + # Hypothesis timing checks are tuned for scalars by default, so we bump + # them from 200ms to 500ms per test case as the global default. If this + # is too short for a specific test, (a) try to make it faster, and (b) + # if it really is slow add `@settings(deadline=...)` with a working value, + # or `deadline=None` to entirely disable timeouts for that test. 
+ deadline=500, + timeout=hypothesis.unlimited, suppress_health_check=(hypothesis.HealthCheck.too_slow,) ) hypothesis.settings.load_profile("ci") @@ -131,6 +142,30 @@ def all_arithmetic_operators(request): return request.param +_all_numeric_reductions = ['sum', 'max', 'min', + 'mean', 'prod', 'std', 'var', 'median', + 'kurt', 'skew'] + + +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): + """ + Fixture for numeric reduction names + """ + return request.param + + +_all_boolean_reductions = ['all', 'any'] + + +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request): + """ + Fixture for boolean reduction names + """ + return request.param + + _cython_table = pd.core.base.SelectionMixin._cython_table.items() @@ -212,6 +247,20 @@ def datetime_tz_utc(): return timezone.utc +utc_objs = ['utc', 'dateutil/UTC', utc, tzutc()] +if PY3: + from datetime import timezone + utc_objs.append(timezone.utc) + + +@pytest.fixture(params=utc_objs) +def utc_fixture(request): + """ + Fixture to provide variants of UTC timezone strings and tzinfo objects + """ + return request.param + + @pytest.fixture(params=['inner', 'outer', 'left', 'right']) def join_type(request): """ @@ -256,7 +305,7 @@ def deco(*args): @pytest.fixture def iris(datapath): """The iris dataset as a DataFrame.""" - return pandas.read_csv(datapath('data', 'iris.csv')) + return pd.read_csv(datapath('data', 'iris.csv')) @pytest.fixture(params=['nlargest', 'nsmallest']) @@ -275,6 +324,14 @@ def closed(request): return request.param +@pytest.fixture(params=['left', 'right', 'both', 'neither']) +def other_closed(request): + """ + Secondary closed fixture to allow parametrizing over all pairs of closed + """ + return request.param + + @pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')]) def nulls_fixture(request): """ @@ -299,7 +356,8 @@ def unique_nulls_fixture(request): TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific', - 'dateutil/Asia/Singapore'] + 'dateutil/Asia/Singapore', tzutc(), tzlocal(), FixedOffset(300), + FixedOffset(0), FixedOffset(-300)] @td.parametrize_fixture_doc(str(TIMEZONES)) @@ -328,8 +386,17 @@ def tz_aware_fixture(request): COMPLEX_DTYPES = [complex, "complex64", "complex128"] STRING_DTYPES = [str, 'str', 'U'] +DATETIME_DTYPES = ['datetime64[ns]', 'M8[ns]'] +TIMEDELTA_DTYPES = ['timedelta64[ns]', 'm8[ns]'] + +BOOL_DTYPES = [bool, 'bool'] +BYTES_DTYPES = [bytes, 'bytes'] +OBJECT_DTYPES = [object, 'object'] + ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES -ALL_NUMPY_DTYPES = ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES +ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES + + DATETIME_DTYPES + TIMEDELTA_DTYPES + BOOL_DTYPES + + OBJECT_DTYPES + BYTES_DTYPES * PY3) # bytes only for PY3 @pytest.fixture(params=STRING_DTYPES) @@ -348,8 +415,9 @@ def float_dtype(request): """ Parameterized fixture for float dtypes. - * float32 - * float64 + * float + * 'float32' + * 'float64' """ return request.param @@ -360,8 +428,9 @@ def complex_dtype(request): """ Parameterized fixture for complex dtypes. - * complex64 - * complex128 + * complex + * 'complex64' + * 'complex128' """ return request.param @@ -372,10 +441,11 @@ def sint_dtype(request): """ Parameterized fixture for signed integer dtypes. 
- * int8 - * int16 - * int32 - * int64 + * int + * 'int8' + * 'int16' + * 'int32' + * 'int64' """ return request.param @@ -386,10 +456,10 @@ def uint_dtype(request): """ Parameterized fixture for unsigned integer dtypes. - * uint8 - * uint16 - * uint32 - * uint64 + * 'uint8' + * 'uint16' + * 'uint32' + * 'uint64' """ return request.param @@ -398,16 +468,17 @@ def uint_dtype(request): @pytest.fixture(params=ALL_INT_DTYPES) def any_int_dtype(request): """ - Parameterized fixture for any integer dtypes. + Parameterized fixture for any integer dtype. - * int8 - * uint8 - * int16 - * uint16 - * int32 - * uint32 - * int64 - * uint64 + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' """ return request.param @@ -416,18 +487,20 @@ def any_int_dtype(request): @pytest.fixture(params=ALL_REAL_DTYPES) def any_real_dtype(request): """ - Parameterized fixture for any (purely) real numeric dtypes. + Parameterized fixture for any (purely) real numeric dtype. - * int8 - * uint8 - * int16 - * uint16 - * int32 - * uint32 - * int64 - * uint64 - * float32 - * float64 + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' """ return request.param @@ -438,26 +511,117 @@ def any_numpy_dtype(request): """ Parameterized fixture for all numpy dtypes. - * int8 - * uint8 - * int16 - * uint16 - * int32 - * uint32 - * int64 - * uint64 - * float32 - * float64 - * complex64 - * complex128 + * bool + * 'bool' + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' + * complex + * 'complex64' + * 'complex128' * str * 'str' * 'U' + * bytes + * 'bytes' + * 'datetime64[ns]' + * 'M8[ns]' + * 'timedelta64[ns]' + * 'm8[ns]' + * object + * 'object' """ return request.param +# categoricals are handled separately +_any_skipna_inferred_dtype = [ + ('string', ['a', np.nan, 'c']), + ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), + ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), + ('empty', [np.nan, np.nan, np.nan]), + ('empty', []), + ('mixed-integer', ['a', np.nan, 2]), + ('mixed', ['a', np.nan, 2.0]), + ('floating', [1.0, np.nan, 2.0]), + ('integer', [1, np.nan, 2]), + ('mixed-integer-float', [1, np.nan, 2.0]), + ('decimal', [Decimal(1), np.nan, Decimal(2)]), + ('boolean', [True, np.nan, False]), + ('datetime64', [np.datetime64('2013-01-01'), np.nan, + np.datetime64('2018-01-01')]), + ('datetime', [pd.Timestamp('20130101'), np.nan, pd.Timestamp('20180101')]), + ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), + # The following two dtypes are commented out due to GH 23554 + # ('complex', [1 + 1j, np.nan, 2 + 2j]), + # ('timedelta64', [np.timedelta64(1, 'D'), + # np.nan, np.timedelta64(2, 'D')]), + ('timedelta', [timedelta(1), np.nan, timedelta(2)]), + ('time', [time(1), np.nan, time(2)]), + ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]), + ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])] +ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id + + +@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) +def any_skipna_inferred_dtype(request): + """ + Fixture for all inferred dtypes from _libs.lib.infer_dtype + + The covered (inferred) types are: + * 'string' + * 'unicode' (if PY2) + * 'empty' + * 'bytes' (if PY3) + * 'mixed' + * 'mixed-integer' + * 'mixed-integer-float' + * 'floating' + * 'integer' + * 'decimal' + * 'boolean' + * 
'datetime64' + * 'datetime' + * 'date' + * 'timedelta' + * 'time' + * 'period' + * 'interval' + + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> import pandas._libs.lib as lib + >>> + >>> def test_something(any_skipna_inferred_dtype): + ... inferred_dtype, values = any_skipna_inferred_dtype + ... # will pass + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values + + @pytest.fixture def mock(): """ @@ -475,7 +639,6 @@ def mock(): # ---------------------------------------------------------------- # Global setup for tests using Hypothesis -from hypothesis import strategies as st # Registering these strategies makes them globally available via st.from_type, # which is use for offsets in tests/tseries/offsets/test_offsets_properties.py diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index eab529584d1fb..fa1dc751c17da 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -11,7 +11,7 @@ class DirNamesMixin(object): - _accessors = frozenset([]) + _accessors = frozenset() _deprecations = frozenset( ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides']) @@ -41,7 +41,9 @@ def __dir__(self): class PandasDelegate(object): - """ an abstract base class for delegating methods/properties """ + """ + an abstract base class for delegating methods/properties + """ def _delegate_property_get(self, name, *args, **kwargs): raise TypeError("You cannot access the " @@ -113,15 +115,18 @@ def delegate_names(delegate, accessors, typ, overwrite=False): Parameters ---------- - delegate : the class to get methods/properties & doc-strings - acccessors : string list of accessors to add - typ : 'property' or 'method' + delegate : object + the class to get methods/properties & doc-strings + acccessors : Sequence[str] + List of accessor to add + typ : {'property', 'method'} overwrite : boolean, default False overwrite the method/property in the target class if it exists Returns ------- - decorator + callable + A class decorator. Examples -------- @@ -143,7 +148,8 @@ def add_delegate_accessors(cls): # 2. We use a UserWarning instead of a custom Warning class CachedAccessor(object): - """Custom property-like object (descriptor) for caching accessors. + """ + Custom property-like object (descriptor) for caching accessors. Parameters ---------- @@ -186,7 +192,8 @@ def decorator(accessor): return decorator -_doc = """Register a custom accessor on %(klass)s objects. +_doc = """\ +Register a custom accessor on %(klass)s objects. Parameters ---------- @@ -201,7 +208,8 @@ def decorator(accessor): .. code-block:: python - def __init__(self, pandas_object): + def __init__(self, pandas_object): # noqa: E999 + ... For consistency with pandas methods, you should raise an ``AttributeError`` if the data passed to your accessor has an incorrect dtype. 
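As a concrete illustration of the registration pattern this docstring describes, a minimal sketch of a custom DataFrame accessor (the `geo` namespace and the column names are just example choices):

import pandas as pd

@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor(object):
    def __init__(self, pandas_obj):
        # per the guidance above: reject incompatible data with AttributeError
        if not {"latitude", "longitude"} <= set(pandas_obj.columns):
            raise AttributeError("Must have 'latitude' and 'longitude' columns")
        self._obj = pandas_obj

    @property
    def center(self):
        # (latitude, longitude) of the mean point
        return self._obj.latitude.mean(), self._obj.longitude.mean()

df = pd.DataFrame({"longitude": [0.0, 10.0], "latitude": [5.0, 15.0]})
df.geo.center   # (10.0, 5.0)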
@@ -243,7 +251,7 @@ def plot(self): >>> ds.geo.plot() # plots data on a map -See also +See Also -------- %(others)s """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e91cc8ec1e996..1a4368ee8ea98 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,41 +3,31 @@ intended for public consumption """ from __future__ import division -from warnings import warn, catch_warnings, simplefilter + from textwrap import dedent +from warnings import catch_warnings, simplefilter, warn import numpy as np +from pandas._libs import algos, hashtable as htable, lib +from pandas._libs.tslib import iNaT +from pandas.util._decorators import Appender, Substitution, deprecate_kwarg + from pandas.core.dtypes.cast import ( - maybe_promote, construct_1d_object_array_from_listlike) -from pandas.core.dtypes.generic import ( - ABCSeries, ABCIndex, - ABCIndexClass) + construct_1d_object_array_from_listlike, maybe_promote) from pandas.core.dtypes.common import ( - is_array_like, - is_unsigned_integer_dtype, is_signed_integer_dtype, - is_integer_dtype, is_complex_dtype, - is_object_dtype, - is_extension_array_dtype, - is_categorical_dtype, is_sparse, - is_period_dtype, - is_numeric_dtype, is_float_dtype, - is_bool_dtype, needs_i8_conversion, - is_datetimetz, - is_datetime64_any_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_datetimelike, - is_interval_dtype, is_scalar, is_list_like, - ensure_platform_int, ensure_object, - ensure_float64, ensure_uint64, - ensure_int64) -from pandas.compat.numpy import _np_version_under1p10 + ensure_float64, ensure_int64, ensure_object, ensure_platform_int, + ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, + is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, + is_datetimelike, is_extension_array_dtype, is_float_dtype, + is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, + is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, + is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, + needs_i8_conversion) +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas.core import common as com -from pandas._libs import algos, lib, hashtable as htable -from pandas._libs.tslib import iNaT -from pandas.util._decorators import (Appender, Substitution, - deprecate_kwarg) _shared_docs = {} @@ -275,8 +265,8 @@ def match(to_match, values, na_sentinel=-1): # replace but return a numpy array # use a Series because it handles dtype conversions properly from pandas import Series - result = Series(result.ravel()).replace(-1, na_sentinel).values.\ - reshape(result.shape) + result = Series(result.ravel()).replace(-1, na_sentinel) + result = result.values.reshape(result.shape) return result @@ -353,7 +343,6 @@ def unique(values): -------- pandas.Index.unique pandas.Series.unique - """ values = _ensure_arraylike(values) @@ -388,8 +377,8 @@ def isin(comps, values): Parameters ---------- - comps: array-like - values: array-like + comps : array-like + values : array-like Returns ------- @@ -468,15 +457,13 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, ------- labels, uniques : ndarray """ - (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) + (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - uniques = vec_klass() - labels = table.get_labels(values, uniques, 0, na_sentinel, - na_value=na_value) + uniques, 
labels = table.factorize(values, na_sentinel=na_sentinel, + na_value=na_value) labels = ensure_platform_int(labels) - uniques = uniques.to_array() return labels, uniques @@ -512,8 +499,8 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, See Also -------- - pandas.cut : Discretize continuous-valued array. - pandas.unique : Find the unique value in an array. + cut : Discretize continuous-valued array. + unique : Find the unique value in an array. Examples -------- @@ -910,26 +897,12 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): ------ OverflowError if any x + y exceeds the maximum or minimum int64 value. """ - def _broadcast(arr_or_scalar, shape): - """ - Helper function to broadcast arrays / scalars to the desired shape. - """ - if _np_version_under1p10: - if is_scalar(arr_or_scalar): - out = np.empty(shape) - out.fill(arr_or_scalar) - else: - out = arr_or_scalar - else: - out = np.broadcast_to(arr_or_scalar, shape) - return out - # For performance reasons, we broadcast 'b' to the new array 'b2' # so that it has the same size as 'arr'. - b2 = _broadcast(b, arr.shape) + b2 = np.broadcast_to(b, arr.shape) if b_mask is not None: # We do the same broadcasting for b_mask as well. - b2_mask = _broadcast(b_mask, arr.shape) + b2_mask = np.broadcast_to(b_mask, arr.shape) else: b2_mask = None @@ -1178,7 +1151,7 @@ class SelectNFrame(SelectN): def __init__(self, obj, n, keep, columns): super(SelectNFrame, self).__init__(obj, n, keep) - if not is_list_like(columns): + if not is_list_like(columns) or isinstance(columns, tuple): columns = [columns] columns = list(columns) self.columns = columns @@ -1608,7 +1581,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, # dispatch to internal type takes if is_extension_array_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif is_datetimetz(arr): + elif is_datetime64tz_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) diff --git a/pandas/core/api.py b/pandas/core/api.py index 32df317a602a9..ad35b647ac458 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -14,7 +14,7 @@ MultiIndex, IntervalIndex, TimedeltaIndex, DatetimeIndex, PeriodIndex, NaT) -from pandas.core.indexes.period import Period, period_range, pnow +from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range from pandas.core.indexes.interval import Interval, interval_range @@ -36,27 +36,6 @@ describe_option, option_context, options) -# deprecation, xref #13790 -def match(*args, **kwargs): - - import warnings - warnings.warn("pd.match() is deprecated and will be removed " - "in a future version", - FutureWarning, stacklevel=2) - from pandas.core.algorithms import match - return match(*args, **kwargs) - - -def groupby(*args, **kwargs): - import warnings - - warnings.warn("pd.groupby() is deprecated and will be removed; " - "Please use the Series.groupby() or " - "DataFrame.groupby() methods", - FutureWarning, stacklevel=2) - return args[0].groupby(*args[1:], **kwargs) - - # Deprecation: xref gh-16747 class TimeGrouper(object): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 40cd952a62138..5658094ec36c6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,15 +1,15 @@ import warnings + import 
numpy as np -from pandas import compat + from pandas._libs import reduction -from pandas.core.dtypes.generic import ABCSeries -from pandas.core.dtypes.common import ( - is_extension_type, - is_dict_like, - is_list_like, - is_sequence) +import pandas.compat as compat from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.common import ( + is_dict_like, is_extension_type, is_list_like, is_sequence) +from pandas.core.dtypes.generic import ABCSeries + from pandas.io.formats.printing import pprint_thing @@ -132,7 +132,7 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): with np.errstate(all='ignore'): - results = self.f(self.values) + results = self.obj._data.apply('apply', func=self.f) return self.obj._constructor(data=results, index=self.index, columns=self.columns, copy=False) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 29f258bf1b29e..ea8837332633a 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -4,7 +4,8 @@ from .categorical import Categorical # noqa from .datetimes import DatetimeArrayMixin # noqa from .interval import IntervalArray # noqa -from .period import PeriodArrayMixin # noqa +from .period import PeriodArray, period_array # noqa from .timedeltas import TimedeltaArrayMixin # noqa from .integer import ( # noqa IntegerArray, integer_array) +from .sparse import SparseArray # noqa diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index efe587c6aaaad..9c6aa4a12923f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -5,21 +5,25 @@ This is an experimental API and subject to breaking changes without warning. """ -import numpy as np - import operator -from pandas.errors import AbstractMethodError +import numpy as np + +from pandas.compat import PY3, set_function_name from pandas.compat.numpy import function as nv -from pandas.compat import set_function_name, PY3 -from pandas.core import ops +from pandas.errors import AbstractMethodError + from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + +from pandas.core import ops _not_implemented_message = "{} does not implement {}." class ExtensionArray(object): - """Abstract base class for custom 1-D array types. + """ + Abstract base class for custom 1-D array types. pandas will recognize instances of this class as proper arrays with a custom type and will not attempt to coerce them to objects. They @@ -43,10 +47,12 @@ class ExtensionArray(object): * copy * _concat_same_type - An additional method is available to satisfy pandas' internal, - private block API. + A default repr displaying the type, (truncated) data, length, + and dtype is provided. It can be customized or replaced by + by overriding: - * _formatting_values + * __repr__ : A default repr for the ExtensionArray. + * _formatter : Print scalars inside a Series or DataFrame. Some methods require casting the ExtensionArray to an ndarray of Python objects with ``self.astype(object)``, which may be expensive. When @@ -63,6 +69,10 @@ class ExtensionArray(object): as they only compose abstract methods. Still, a more efficient implementation may be available, and these methods can be overridden. + One can implement methods to handle array reductions. + + * _reduce + This class does not inherit from 'abc.ABCMeta' for performance reasons. 
Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is @@ -93,7 +103,8 @@ class ExtensionArray(object): # ------------------------------------------------------------------------ @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): - """Construct a new ExtensionArray from a sequence of scalars. + """ + Construct a new ExtensionArray from a sequence of scalars. Parameters ---------- @@ -105,6 +116,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): compatible with the ExtensionArray. copy : boolean, default False If True, copy the underlying data. + Returns ------- ExtensionArray @@ -113,7 +125,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): @classmethod def _from_factorized(cls, values, original): - """Reconstruct an ExtensionArray after factorization. + """ + Reconstruct an ExtensionArray after factorization. Parameters ---------- @@ -135,7 +148,8 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): # type (Any) -> Any - """Select a subset of self. + """ + Select a subset of self. Parameters ---------- @@ -166,7 +180,8 @@ def __getitem__(self, item): def __setitem__(self, key, value): # type: (Union[int, np.ndarray], Any) -> None - """Set one or more values inplace. + """ + Set one or more values inplace. This method is not required to satisfy the pandas extension array interface. @@ -211,7 +226,8 @@ def __setitem__(self, key, value): def __len__(self): # type: () -> int - """Length of this array + """ + Length of this array Returns ------- @@ -220,8 +236,8 @@ def __len__(self): raise AbstractMethodError(self) def __iter__(self): - """Iterate over elements of the array. - + """ + Iterate over elements of the array. """ # This needs to be implemented so that pandas recognizes extension # arrays as list-like. The default implementation makes successive @@ -235,26 +251,32 @@ def __iter__(self): @property def dtype(self): # type: () -> ExtensionDtype - """An instance of 'ExtensionDtype'.""" + """ + An instance of 'ExtensionDtype'. + """ raise AbstractMethodError(self) @property def shape(self): # type: () -> Tuple[int, ...] - """Return a tuple of the array dimensions.""" + """ + Return a tuple of the array dimensions. + """ return (len(self),) @property def ndim(self): # type: () -> int - """Extension Arrays are only allowed to be 1-dimensional.""" + """ + Extension Arrays are only allowed to be 1-dimensional. + """ return 1 @property def nbytes(self): # type: () -> int - """The number of bytes needed to store this object in memory. - + """ + The number of bytes needed to store this object in memory. """ # If this is expensive to compute, return an approximate lower bound # on the number of bytes needed. @@ -264,7 +286,8 @@ def nbytes(self): # Additional Methods # ------------------------------------------------------------------------ def astype(self, dtype, copy=True): - """Cast to a NumPy array with 'dtype'. + """ + Cast to a NumPy array with 'dtype'. Parameters ---------- @@ -283,16 +306,32 @@ def astype(self, dtype, copy=True): return np.array(self, dtype=dtype, copy=copy) def isna(self): - # type: () -> np.ndarray - """Boolean NumPy array indicating if each value is missing. + # type: () -> Union[ExtensionArray, np.ndarray] + """ + A 1-D array indicating if each value is missing. + + Returns + ------- + na_values : Union[np.ndarray, ExtensionArray] + In most cases, this should return a NumPy ndarray. 
For + exceptional cases like ``SparseArray``, where returning + an ndarray would be expensive, an ExtensionArray may be + returned. + + Notes + ----- + If returning an ExtensionArray, then - This should return a 1-D array the same length as 'self'. + * ``na_values._is_boolean`` should be True + * `na_values` should implement :func:`ExtensionArray._reduce` + * ``na_values.any`` and ``na_values.all`` should be implemented """ raise AbstractMethodError(self) def _values_for_argsort(self): # type: () -> ndarray - """Return values for sorting. + """ + Return values for sorting. Returns ------- @@ -342,7 +381,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): return result def fillna(self, value=None, method=None, limit=None): - """ Fill NA/NaN values using the specified method. + """ + Fill NA/NaN values using the specified method. Parameters ---------- @@ -395,7 +435,8 @@ def fillna(self, value=None, method=None, limit=None): return new_values def dropna(self): - """ Return ExtensionArray without NA values + """ + Return ExtensionArray without NA values Returns ------- @@ -439,7 +480,8 @@ def shift(self, periods=1): return self._concat_same_type([a, b]) def unique(self): - """Compute the ExtensionArray of unique values. + """ + Compute the ExtensionArray of unique values. Returns ------- @@ -452,7 +494,8 @@ def unique(self): def _values_for_factorize(self): # type: () -> Tuple[ndarray, Any] - """Return an array and missing value suitable for factorization. + """ + Return an array and missing value suitable for factorization. Returns ------- @@ -466,12 +509,18 @@ def _values_for_factorize(self): as NA in the factorization routines, so it will be coded as `na_sentinal` and not included in `uniques`. By default, ``np.nan`` is used. + + Notes + ----- + The values returned by this method are also used in + :func:`pandas.util.hash_pandas_object`. """ return self.astype(object), np.nan def factorize(self, na_sentinel=-1): # type: (int) -> Tuple[ndarray, ExtensionArray] - """Encode the extension array as an enumerated type. + """ + Encode the extension array as an enumerated type. Parameters ---------- @@ -524,7 +573,8 @@ def factorize(self, na_sentinel=-1): def take(self, indices, allow_fill=False, fill_value=None): # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray - """Take elements from an array. + """ + Take elements from an array. Parameters ---------- @@ -613,7 +663,8 @@ def take(self, indices, allow_fill=False, fill_value=None): def copy(self, deep=False): # type: (bool) -> ExtensionArray - """Return a copy of the array. + """ + Return a copy of the array. Parameters ---------- @@ -627,19 +678,75 @@ def copy(self, deep=False): raise AbstractMethodError(self) # ------------------------------------------------------------------------ - # Block-related methods + # Printing # ------------------------------------------------------------------------ + def __repr__(self): + from pandas.io.formats.printing import format_object_summary + + template = ( + u'{class_name}' + u'{data}\n' + u'Length: {length}, dtype: {dtype}' + ) + # the short repr has no trailing newline, while the truncated + # repr does. 
So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + data = format_object_summary(self, self._formatter(), + indent_for_name=False).rstrip(', \n') + class_name = u'<{}>\n'.format(self.__class__.__name__) + return template.format(class_name=class_name, data=data, + length=len(self), + dtype=self.dtype) + + def _formatter(self, boxed=False): + # type: (bool) -> Callable[[Any], Optional[str]] + """Formatting function for scalar values. + + This is used in the default '__repr__'. The returned formatting + function receives instances of your scalar type. + + Parameters + ---------- + boxed: bool, default False + An indicated for whether or not your array is being printed + within a Series, DataFrame, or Index (True), or just by + itself (False). This may be useful if you want scalar values + to appear differently within a Series versus on its own (e.g. + quoted or not). + + Returns + ------- + Callable[[Any], str] + A callable that gets instances of the scalar type and + returns a string. By default, :func:`repr` is used + when ``boxed=False`` and :func:`str` is used when + ``boxed=True``. + """ + if boxed: + return str + return repr def _formatting_values(self): # type: () -> np.ndarray # At the moment, this has to be an array since we use result.dtype - """An array of values to be printed in, e.g. the Series repr""" + """ + An array of values to be printed in, e.g. the Series repr + + .. deprecated:: 0.24.0 + + Use :meth:`ExtensionArray._formatter` instead. + """ return np.array(self) + # ------------------------------------------------------------------------ + # Reshaping + # ------------------------------------------------------------------------ + @classmethod def _concat_same_type(cls, to_concat): # type: (Sequence[ExtensionArray]) -> ExtensionArray - """Concatenate multiple array + """ + Concatenate multiple array Parameters ---------- @@ -661,7 +768,8 @@ def _concat_same_type(cls, to_concat): @property def _ndarray_values(self): # type: () -> np.ndarray - """Internal pandas method for lossy conversion to a NumPy ndarray. + """ + Internal pandas method for lossy conversion to a NumPy ndarray. This method is not part of the pandas interface. @@ -670,10 +778,43 @@ def _ndarray_values(self): """ return np.array(self) + def _reduce(self, name, skipna=True, **kwargs): + """ + Return a scalar result of performing the reduction operation. + + Parameters + ---------- + name : str + Name of the function, supported values are: + { any, all, min, max, sum, mean, median, prod, + std, var, sem, kurt, skew }. + skipna : bool, default True + If True, skip NaN values. + **kwargs + Additional keyword arguments passed to the reduction function. + Currently, `ddof` is the only supported kwarg. + + Returns + ------- + scalar + + Raises + ------ + TypeError : subclass does not define reductions + """ + raise TypeError("cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype)) + class ExtensionOpsMixin(object): """ - A base class for linking the operators to their dunder names + A base class for linking the operators to their dunder names. + + .. note:: + + You may want to set ``__array_priority__`` if you want your + implementation to be called when involved in binary operations + with NumPy arrays. 
""" @classmethod @@ -710,12 +851,14 @@ def _add_comparison_ops(cls): class ExtensionScalarOpsMixin(ExtensionOpsMixin): - """A mixin for defining the arithmetic and logical operations on - an ExtensionArray class, where it is assumed that the underlying objects - have the operators already defined. + """ + A mixin for defining ops on an ExtensionArray. + + It is assumed that the underlying scalar objects have the operators + already defined. - Usage - ------ + Notes + ----- If you have defined a subclass MyExtensionArray(ExtensionArray), then use MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin) to get the arithmetic operators. After the definition of MyExtensionArray, @@ -725,6 +868,12 @@ class ExtensionScalarOpsMixin(ExtensionOpsMixin): MyExtensionArray._add_comparison_ops() to link the operators to your class. + + .. note:: + + You may want to set ``__array_priority__`` if you want your + implementation to be called when involved in binary operations + with NumPy arrays. """ @classmethod @@ -774,6 +923,11 @@ def convert_values(param): else: # Assume its an object ovalues = [param] * len(self) return ovalues + + if isinstance(other, (ABCSeries, ABCIndexClass)): + # rely on pandas to unbox and dispatch to us + return NotImplemented + lvalues = self rvalues = convert_values(other) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 216bccf7d6309..938ca53b5fdce 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,58 +1,46 @@ # pylint: disable=E1101,W0232 -import numpy as np -from warnings import warn import textwrap +from warnings import warn -from pandas import compat -from pandas.compat import u, lzip -from pandas._libs import lib, algos as libalgos +import numpy as np + +from pandas._libs import algos as libalgos, lib +import pandas.compat as compat +from pandas.compat import lzip, u +from pandas.compat.numpy import function as nv +from pandas.util._decorators import ( + Appender, Substitution, cache_readonly, deprecate_kwarg) +from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core.dtypes.generic import ( - ABCSeries, ABCIndexClass, ABCCategoricalIndex) -from pandas.core.dtypes.missing import isna, notna -from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.cast import ( - maybe_infer_to_datetimelike, - coerce_indexer_dtype) -from pandas.core.dtypes.dtypes import CategoricalDtype + coerce_indexer_dtype, maybe_infer_to_datetimelike) from pandas.core.dtypes.common import ( - ensure_int64, - ensure_object, - ensure_platform_int, - is_extension_array_dtype, - is_dtype_equal, - is_datetimelike, - is_datetime64_dtype, - is_timedelta64_dtype, - is_categorical, - is_categorical_dtype, - is_float_dtype, - is_integer_dtype, - is_list_like, is_sequence, - is_scalar, is_iterator, - is_dict_like) - -from pandas.core.algorithms import factorize, take_1d, unique1d, take + ensure_int64, ensure_object, ensure_platform_int, is_categorical, + is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, + is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype, + is_iterator, is_list_like, is_object_dtype, is_scalar, is_sequence, + is_timedelta64_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.generic import ( + ABCCategoricalIndex, ABCIndexClass, ABCSeries) +from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.missing import isna, notna + from 
pandas.core.accessor import PandasDelegate, delegate_names -from pandas.core.base import (PandasObject, - NoNewAttributesMixin, _shared_docs) +import pandas.core.algorithms as algorithms +from pandas.core.algorithms import factorize, take, take_1d, unique1d +from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com +from pandas.core.config import get_option from pandas.core.missing import interpolate_2d -from pandas.compat.numpy import function as nv -from pandas.util._decorators import ( - Appender, cache_readonly, deprecate_kwarg, Substitution) - -import pandas.core.algorithms as algorithms +from pandas.core.sorting import nargsort from pandas.io.formats import console from pandas.io.formats.terminal import get_terminal_size -from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core.config import get_option from .base import ExtensionArray - _take_msg = textwrap.dedent("""\ Interpreting negative values in 'indexer' as missing values. In the future, this will change to meaning positional indices @@ -110,7 +98,7 @@ def f(self, other): ret[na_mask] = False return ret - # Numpy-1.9 and earlier may convert a scalar to a zerodim array during + # Numpy < 1.13 may convert a scalar to a zerodim array during # comparison operation when second arg has higher priority, e.g. # # cat[0] < cat @@ -212,7 +200,8 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -_codes_doc = """The category codes of this categorical. +_codes_doc = """\ +The category codes of this categorical. Level codes are an array if integer which are the positions of the real values in the categories array. @@ -310,10 +299,10 @@ class Categorical(ExtensionArray, PandasObject): See the `user guide `_ for more. - See also + See Also -------- - pandas.api.types.CategoricalDtype : Type for categorical data - CategoricalIndex : An Index with an underlying ``Categorical`` + pandas.api.types.CategoricalDtype : Type for categorical data. + CategoricalIndex : An Index with an underlying ``Categorical``. """ # For comparisons, so that numpy uses our implementation if the compare @@ -340,7 +329,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # a.) use categories, ordered # b.) use values.dtype # c.) 
infer from values - if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, compat.string_types): @@ -360,6 +348,16 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # the "ordered" and "categories" arguments dtype = values.dtype._from_categorical_dtype(values.dtype, categories, ordered) + + # GH23814, for perf, if values._values already an instance of + # Categorical, set values to codes, and run fastpath + if (isinstance(values, (ABCSeries, ABCIndexClass)) and + isinstance(values._values, type(self))): + values = values._values.codes.copy() + if categories is None: + categories = dtype.categories + fastpath = True + else: # If dtype=None and values is not categorical, create a new dtype dtype = CategoricalDtype(categories, ordered) @@ -382,12 +380,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, dtype = CategoricalDtype(values.categories, dtype.ordered) elif not isinstance(values, (ABCIndexClass, ABCSeries)): - # _sanitize_array coerces np.nan to a string under certain versions + # sanitize_array coerces np.nan to a string under certain versions # of numpy values = maybe_infer_to_datetimelike(values, convert_dates=True) if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) - from pandas.core.series import _sanitize_array + from pandas.core.internals.construction import sanitize_array # By convention, empty lists result in object dtype: if len(values) == 0: sanitize_dtype = 'object' @@ -396,7 +394,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, null_mask = isna(values) if null_mask.any(): values = [values[idx] for idx in np.where(~null_mask)[0]] - values = _sanitize_array(values, None, dtype=sanitize_dtype) + values = sanitize_array(values, None, dtype=sanitize_dtype) if dtype.categories is None: try: @@ -438,7 +436,8 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, @property def categories(self): - """The categories of this categorical. + """ + The categories of this categorical. Setting assigns new values to each category (effectively a rename of each individual category). @@ -455,7 +454,7 @@ def categories(self): If the new categories do not validate as categories or if the number of new categories is unequal the number of old categories - See also + See Also -------- rename_categories reorder_categories @@ -477,12 +476,16 @@ def categories(self, categories): @property def ordered(self): - """Whether the categories have an ordered relationship""" + """ + Whether the categories have an ordered relationship + """ return self.dtype.ordered @property def dtype(self): - """The :class:`~pandas.api.types.CategoricalDtype` for this instance""" + """ + The :class:`~pandas.api.types.CategoricalDtype` for this instance + """ return self._dtype @property @@ -497,8 +500,14 @@ def _constructor(self): def _from_sequence(cls, scalars, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) + def _formatter(self, boxed=False): + # Defer to CategoricalFormatter's formatter. + return None + def copy(self): - """ Copy constructor. """ + """ + Copy constructor. 
+ """ return self._constructor(values=self._codes.copy(), dtype=self.dtype, fastpath=True) @@ -529,17 +538,23 @@ def astype(self, dtype, copy=True): @cache_readonly def ndim(self): - """Number of dimensions of the Categorical """ + """ + Number of dimensions of the Categorical + """ return self._codes.ndim @cache_readonly def size(self): - """ return the len of myself """ + """ + return the len of myself + """ return len(self) @cache_readonly def itemsize(self): - """ return the size of a single category """ + """ + return the size of a single category + """ return self.categories.itemsize def tolist(self): @@ -554,13 +569,16 @@ def tolist(self): @property def base(self): - """ compat, we are always our own object """ + """ + compat, we are always our own object + """ return None @classmethod def _from_inferred_categories(cls, inferred_categories, inferred_codes, - dtype): - """Construct a Categorical from inferred values + dtype, true_values=None): + """ + Construct a Categorical from inferred values. For inferred categories (`dtype` is None) the categories are sorted. For explicit `dtype`, the `inferred_categories` are cast to the @@ -568,10 +586,12 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, Parameters ---------- - inferred_categories : Index inferred_codes : Index dtype : CategoricalDtype or 'category' + true_values : list, optional + If none are provided, the default ones are + "True", "TRUE", and "true." Returns ------- @@ -580,27 +600,32 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, from pandas import Index, to_numeric, to_datetime, to_timedelta cats = Index(inferred_categories) - known_categories = (isinstance(dtype, CategoricalDtype) and dtype.categories is not None) if known_categories: - # Convert to a specialzed type with `dtype` if specified + # Convert to a specialized type with `dtype` if specified. if dtype.categories.is_numeric(): - cats = to_numeric(inferred_categories, errors='coerce') + cats = to_numeric(inferred_categories, errors="coerce") elif is_datetime64_dtype(dtype.categories): - cats = to_datetime(inferred_categories, errors='coerce') + cats = to_datetime(inferred_categories, errors="coerce") elif is_timedelta64_dtype(dtype.categories): - cats = to_timedelta(inferred_categories, errors='coerce') + cats = to_timedelta(inferred_categories, errors="coerce") + elif dtype.categories.is_boolean(): + if true_values is None: + true_values = ["True", "TRUE", "true"] + + cats = cats.isin(true_values) if known_categories: - # recode from observation order to dtype.categories order + # Recode from observation order to dtype.categories order. categories = dtype.categories codes = _recode_for_categories(inferred_codes, cats, categories) elif not cats.is_monotonic_increasing: - # sort categories and recode for unknown categories + # Sort categories and recode for unknown categories. unsorted = cats.copy() categories = cats.sort_values() + codes = _recode_for_categories(inferred_codes, unsorted, categories) dtype = CategoricalDtype(categories, ordered=False) @@ -665,7 +690,8 @@ def from_codes(cls, codes, categories, ordered=False): _codes = None def _get_codes(self): - """ Get the codes. + """ + Get the codes. 
Returns ------- @@ -685,7 +711,8 @@ def _set_codes(self, codes): codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) def _set_categories(self, categories, fastpath=False): - """ Sets new categories inplace + """ + Sets new categories inplace Parameters ---------- @@ -718,7 +745,8 @@ def _set_categories(self, categories, fastpath=False): self._dtype = new_dtype def _set_dtype(self, dtype): - """Internal method for directly updating the CategoricalDtype + """ + Internal method for directly updating the CategoricalDtype Parameters ---------- @@ -780,7 +808,8 @@ def as_unordered(self, inplace=False): def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): - """ Sets the categories to the specified new_categories. + """ + Sets the categories to the specified new_categories. `new_categories` can include new categories (which will result in unused categories) or remove old categories (which results in values @@ -821,7 +850,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, ------- cat : Categorical with reordered categories or None if inplace. - See also + See Also -------- rename_categories reorder_categories @@ -850,7 +879,8 @@ def set_categories(self, new_categories, ordered=None, rename=False, return cat def rename_categories(self, new_categories, inplace=False): - """ Renames categories. + """ + Renames categories. Raises ------ @@ -892,7 +922,7 @@ def rename_categories(self, new_categories, inplace=False): With ``inplace=False``, the new categorical is returned. With ``inplace=True``, there is no return value. - See also + See Also -------- reorder_categories add_categories @@ -943,7 +973,8 @@ def rename_categories(self, new_categories, inplace=False): return cat def reorder_categories(self, new_categories, ordered=None, inplace=False): - """ Reorders categories as specified in new_categories. + """ + Reorders categories as specified in new_categories. `new_categories` need to include all old categories and no new category items. @@ -969,7 +1000,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): ------- cat : Categorical with reordered categories or None if inplace. - See also + See Also -------- rename_categories add_categories @@ -985,7 +1016,8 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): inplace=inplace) def add_categories(self, new_categories, inplace=False): - """ Add new categories. + """ + Add new categories. `new_categories` will be included at the last/highest place in the categories and will be unused directly after this call. @@ -1008,7 +1040,7 @@ def add_categories(self, new_categories, inplace=False): ------- cat : Categorical with new categories added or None if inplace. - See also + See Also -------- rename_categories reorder_categories @@ -1034,7 +1066,8 @@ def add_categories(self, new_categories, inplace=False): return cat def remove_categories(self, removals, inplace=False): - """ Removes the specified categories. + """ + Removes the specified categories. `removals` must be included in the old categories. Values which were in the removed categories will be set to NaN @@ -1056,7 +1089,7 @@ def remove_categories(self, removals, inplace=False): ------- cat : Categorical with removed categories or None if inplace. 
- See also + See Also -------- rename_categories reorder_categories @@ -1086,7 +1119,8 @@ def remove_categories(self, removals, inplace=False): rename=False, inplace=inplace) def remove_unused_categories(self, inplace=False): - """ Removes categories which are not used. + """ + Removes categories which are not used. Parameters ---------- @@ -1098,7 +1132,7 @@ def remove_unused_categories(self, inplace=False): ------- cat : Categorical with unused categories dropped or None if inplace. - See also + See Also -------- rename_categories reorder_categories @@ -1209,7 +1243,8 @@ def map(self, mapper): # for Series/ndarray like compat @property def shape(self): - """ Shape of the Categorical. + """ + Shape of the Categorical. For internal compatibility with numpy arrays. @@ -1301,6 +1336,9 @@ def __setstate__(self, state): @property def T(self): + """ + Return transposed numpy array. + """ return self @property @@ -1342,15 +1380,13 @@ def searchsorted(self, value, side='left', sorter=None): "ordered one") from pandas.core.series import Series + codes = _get_codes_for_values(Series(value).values, self.categories) + if -1 in codes: + raise KeyError("Value(s) to be inserted must be in categories.") - values_as_codes = _get_codes_for_values(Series(value).values, - self.categories) - - if -1 in values_as_codes: - raise ValueError("Value(s) to be inserted must be in categories.") + codes = codes[0] if is_scalar(value) else codes - return self.codes.searchsorted(values_as_codes, side=side, - sorter=sorter) + return self.codes.searchsorted(codes, side=side, sorter=sorter) def isna(self): """ @@ -1362,11 +1398,11 @@ def isna(self): ------- a boolean array of whether my values are null - See also + See Also -------- - isna : top-level isna - isnull : alias of isna - Categorical.notna : boolean inverse of Categorical.isna + isna : Top-level isna. + isnull : Alias of isna. + Categorical.notna : Boolean inverse of Categorical.isna. """ @@ -1385,11 +1421,11 @@ def notna(self): ------- a boolean array of whether my values are not null - See also + See Also -------- - notna : top-level notna - notnull : alias of notna - Categorical.isna : boolean inverse of Categorical.notna + notna : Top-level notna. + notnull : Alias of notna. + Categorical.isna : Boolean inverse of Categorical.notna. """ return ~self.isna() @@ -1456,7 +1492,8 @@ def value_counts(self, dropna=True): return Series(count, index=CategoricalIndex(ix), dtype='int64') def get_values(self): - """ Return the values. + """ + Return the values. For internal compatibility with pandas formatting. @@ -1485,7 +1522,8 @@ def argsort(self, *args, **kwargs): # TODO(PY2): use correct signature # We have to do *args, **kwargs to avoid a a py2-only signature # issue since np.argsort differs from argsort. - """Return the indices that would sort the Categorical. + """ + Return the indices that would sort the Categorical. Parameters ---------- @@ -1501,7 +1539,7 @@ def argsort(self, *args, **kwargs): ------- argsorted : numpy array - See also + See Also -------- numpy.ndarray.argsort @@ -1528,7 +1566,8 @@ def argsort(self, *args, **kwargs): return super(Categorical, self).argsort(*args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): - """ Sorts the Categorical by category value returning a new + """ + Sorts the Categorical by category value returning a new Categorical by default. 
While an ordering is applied to the category values, sorting in this @@ -1605,32 +1644,15 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): msg = 'invalid na_position: {na_position!r}' raise ValueError(msg.format(na_position=na_position)) - codes = np.sort(self._codes) - if not ascending: - codes = codes[::-1] - - # NaN handling - na_mask = (codes == -1) - if na_mask.any(): - n_nans = len(codes[na_mask]) - if na_position == "first": - # in this case sort to the front - new_codes = codes.copy() - new_codes[0:n_nans] = -1 - new_codes[n_nans:] = codes[~na_mask] - codes = new_codes - elif na_position == "last": - # ... and to the end - new_codes = codes.copy() - pos = len(codes) - n_nans - new_codes[0:pos] = codes[~na_mask] - new_codes[pos:] = -1 - codes = new_codes + sorted_idx = nargsort(self, + ascending=ascending, + na_position=na_position) + if inplace: - self._codes = codes - return + self._codes = self._codes[sorted_idx] else: - return self._constructor(values=codes, dtype=self.dtype, + return self._constructor(values=self._codes[sorted_idx], + dtype=self.dtype, fastpath=True) def _values_for_rank(self): @@ -1663,7 +1685,8 @@ def _values_for_rank(self): return values def ravel(self, order='C'): - """ Return a flattened (numpy) array. + """ + Return a flattened (numpy) array. For internal compatibility with numpy arrays. @@ -1674,7 +1697,8 @@ def ravel(self, order='C'): return np.array(self) def view(self): - """Return a view of myself. + """ + Return a view of myself. For internal compatibility with numpy arrays. @@ -1686,7 +1710,8 @@ def view(self): return self def to_dense(self): - """Return my 'dense' representation + """ + Return my 'dense' representation For internal compatibility with numpy arrays. @@ -1698,7 +1723,8 @@ def to_dense(self): @deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value') def fillna(self, value=None, method=None, limit=None): - """ Fill NA/NaN values using the specified method. + """ + Fill NA/NaN values using the specified method. Parameters ---------- @@ -1783,8 +1809,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): Parameters ---------- - indexer : sequence of integers - allow_fill : bool, default None. + indexer : sequence of int + The indices in `self` to take. The meaning of negative values in + `indexer` depends on the value of `allow_fill`. + allow_fill : bool, default None How to handle negative values in `indexer`. * False: negative values in `indices` indicate positional indices @@ -1801,11 +1829,52 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): default is ``True``. In the future, this will change to ``False``. + fill_value : object + The value to use for `indices` that are missing (-1), when + ``allow_fill=True``. This should be the category, i.e. a value + in ``self.categories``, not a code. + Returns ------- Categorical This Categorical will have the same categories and ordered as `self`. + + See Also + -------- + Series.take : Similar method for Series. + numpy.ndarray.take : Similar method for NumPy arrays. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'a', 'b']) + >>> cat + [a, a, b] + Categories (2, object): [a, b] + + Specify ``allow_fill==False`` to have negative indices mean indexing + from the right. + + >>> cat.take([0, -1, -2], allow_fill=False) + [a, b, a] + Categories (2, object): [a, b] + + With ``allow_fill=True``, indices equal to ``-1`` mean "missing" + values that should be filled with the `fill_value`, which is + ``np.nan`` by default. 
+ + >>> cat.take([0, -1, -1], allow_fill=True) + [a, NaN, NaN] + Categories (2, object): [a, b] + + The fill value can be specified. + + >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') + [a, a, a] + Categories (3, object): [a, b] + + Specifying a fill value that's not in ``self.categories`` + will raise a ``TypeError``. """ indexer = np.asarray(indexer, dtype=np.intp) if allow_fill is None: @@ -1813,20 +1882,33 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): warn(_take_msg, FutureWarning, stacklevel=2) allow_fill = True + dtype = self.dtype + if isna(fill_value): - # For categorical, any NA value is considered a user-facing - # NA value. Our storage NA value is -1. fill_value = -1 + elif allow_fill: + # convert user-provided `fill_value` to codes + if fill_value in self.categories: + fill_value = self.categories.get_loc(fill_value) + else: + msg = ( + "'fill_value' ('{}') is not in this Categorical's " + "categories." + ) + raise TypeError(msg.format(fill_value)) codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = self._constructor(codes, dtype=self.dtype, fastpath=True) + result = type(self).from_codes(codes, + categories=dtype.categories, + ordered=dtype.ordered) return result take = take_nd def _slice(self, slicer): - """ Return a slice of myself. + """ + Return a slice of myself. For internal compatibility with numpy arrays. """ @@ -1843,15 +1925,21 @@ def _slice(self, slicer): return self._constructor(values=codes, dtype=self.dtype, fastpath=True) def __len__(self): - """The length of this Categorical.""" + """ + The length of this Categorical. + """ return len(self._codes) def __iter__(self): - """Returns an Iterator over the values of this Categorical.""" + """ + Returns an Iterator over the values of this Categorical. + """ return iter(self.get_values().tolist()) def __contains__(self, key): - """Returns True if `key` is in this Categorical.""" + """ + Returns True if `key` is in this Categorical. + """ # if key is a NaN, check if any NaN is in self. if isna(key): return self.isna().any() @@ -1874,7 +1962,9 @@ def _tidy_repr(self, max_vals=10, footer=True): return compat.text_type(result) def _repr_categories(self): - """ return the base repr for the categories """ + """ + return the base repr for the categories + """ max_categories = (10 if get_option("display.max_categories") == 0 else get_option("display.max_categories")) from pandas.io.formats import format as fmt @@ -1891,7 +1981,9 @@ def _repr_categories(self): return category_strs def _repr_categories_info(self): - """ Returns a string representation of the footer.""" + """ + Returns a string representation of the footer. + """ category_strs = self._repr_categories() dtype = getattr(self.categories, 'dtype_str', @@ -1934,7 +2026,9 @@ def _get_repr(self, length=True, na_rep='NaN', footer=True): return compat.text_type(result) def __unicode__(self): - """ Unicode representation. """ + """ + Unicode representation. 
+ """ _maxlen = 10 if len(self._codes) > _maxlen: result = self._tidy_repr(_maxlen) @@ -1946,14 +2040,22 @@ def __unicode__(self): return result + def __repr__(self): + # We want PandasObject.__repr__, which dispatches to __unicode__ + return super(ExtensionArray, self).__repr__() + def _maybe_coerce_indexer(self, indexer): - """ return an indexer coerced to the codes dtype """ + """ + return an indexer coerced to the codes dtype + """ if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': indexer = indexer.astype(self._codes.dtype) return indexer def __getitem__(self, key): - """ Return an item. """ + """ + Return an item. + """ if isinstance(key, (int, np.integer)): i = self._codes[key] if i == -1: @@ -1965,7 +2067,8 @@ def __getitem__(self, key): dtype=self.dtype, fastpath=True) def __setitem__(self, key, value): - """ Item assignment. + """ + Item assignment. Raises @@ -2015,15 +2118,7 @@ def __setitem__(self, key, value): elif isinstance(key, slice): pass - # Array of True/False in Series or Categorical - else: - # There is a bug in numpy, which does not accept a Series as a - # indexer - # https://github.com/pandas-dev/pandas/issues/6168 - # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9 - # FIXME: remove when numpy 1.9 is the lowest numpy version pandas - # accepts... - key = np.asarray(key) + # else: array of True/False in Series or Categorical lindexer = self.categories.get_indexer(rvalue) lindexer = self._maybe_coerce_indexer(lindexer) @@ -2069,17 +2164,16 @@ def _reverse_indexer(self): return result # reduction ops # - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): - """ perform the reduction type operation """ + def _reduce(self, name, axis=0, skipna=True, **kwargs): func = getattr(self, name, None) if func is None: msg = 'Categorical cannot perform the operation {op}' raise TypeError(msg.format(op=name)) - return func(numeric_only=numeric_only, **kwds) + return func(**kwargs) def min(self, numeric_only=None, **kwargs): - """ The minimum value of the object. + """ + The minimum value of the object. Only ordered `Categoricals` have a minimum! @@ -2104,7 +2198,8 @@ def min(self, numeric_only=None, **kwargs): return self.categories[pointer] def max(self, numeric_only=None, **kwargs): - """ The maximum value of the object. + """ + The maximum value of the object. Only ordered `Categoricals` have a maximum! @@ -2263,7 +2358,8 @@ def is_dtype_equal(self, other): return False def describe(self): - """ Describes this Categorical + """ + Describes this Categorical Returns ------- @@ -2284,7 +2380,7 @@ def repeat(self, repeats, *args, **kwargs): """ Repeat elements of a Categorical. - See also + See Also -------- numpy.ndarray.repeat @@ -2304,9 +2400,6 @@ def _concat_same_type(self, to_concat): return _concat_categorical(to_concat) - def _formatting_values(self): - return self - def isin(self, values): """ Check whether `values` are contained in Categorical. @@ -2333,7 +2426,7 @@ def isin(self, values): See Also -------- - pandas.Series.isin : equivalent method on Series + pandas.Series.isin : Equivalent method on Series. 
Examples -------- @@ -2349,12 +2442,12 @@ def isin(self, values): >>> s.isin(['lama']) array([ True, False, True, False, True, False]) """ - from pandas.core.series import _sanitize_array + from pandas.core.internals.construction import sanitize_array if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a [{values_type}]" .format(values_type=type(values).__name__)) - values = _sanitize_array(values, None, None) + values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer(values) code_values = code_values[null_mask | (code_values >= 0)] @@ -2397,7 +2490,6 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): >>> s.cat.set_categories(list('abcde')) >>> s.cat.as_ordered() >>> s.cat.as_unordered() - """ def __init__(self, data): @@ -2421,6 +2513,9 @@ def _delegate_property_set(self, name, new_values): @property def codes(self): + """ + Return Series of codes as well as the index. + """ from pandas import Series return Series(self._parent.codes, index=self.index) @@ -2440,11 +2535,26 @@ def _get_codes_for_values(values, categories): utility routine to turn values into codes given the specified categories """ from pandas.core.algorithms import _get_data_algo, _hashtables - if is_dtype_equal(values.dtype, categories.dtype): + dtype_equal = is_dtype_equal(values.dtype, categories.dtype) + + if dtype_equal: # To prevent erroneous dtype coercion in _get_data_algo, retrieve # the underlying numpy array. gh-22702 - values = getattr(values, 'values', values) - categories = getattr(categories, 'values', categories) + values = getattr(values, '_ndarray_values', values) + categories = getattr(categories, '_ndarray_values', categories) + elif (is_extension_array_dtype(categories.dtype) and + is_object_dtype(values)): + # Support inferring the correct extension dtype from an array of + # scalar objects. e.g. 
+ # Categorical(array[Period, Period], categories=PeriodIndex(...)) + try: + values = ( + categories.dtype.construct_array_type()._from_sequence(values) + ) + except Exception: + # but that may fail for any reason, so fall back to object + values = ensure_object(values) + categories = ensure_object(categories) else: values = ensure_object(values) categories = ensure_object(categories) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1ce60510c6a69..45eec41e498d1 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -5,43 +5,35 @@ import numpy as np -from pandas._libs import lib, iNaT, NaT -from pandas._libs.tslibs import timezones -from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds, Timedelta +from pandas._libs import NaT, iNaT, lib from pandas._libs.tslibs.period import ( - Period, DIFFERENT_FREQ_INDEX, IncompatibleFrequency) - -from pandas.errors import NullFrequencyError, PerformanceWarning -from pandas import compat - -from pandas.tseries import frequencies -from pandas.tseries.offsets import Tick, DateOffset + DIFFERENT_FREQ_INDEX, IncompatibleFrequency, Period) +from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds +from pandas._libs.tslibs.timestamps import ( + RoundTo, maybe_integer_op_deprecated, round_nsint64) +import pandas.compat as compat +from pandas.errors import ( + AbstractMethodError, NullFrequencyError, PerformanceWarning) +from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( - needs_i8_conversion, - is_list_like, - is_offsetlike, - is_extension_array_dtype, - is_datetime64_dtype, - is_datetime64_any_dtype, - is_datetime64tz_dtype, - is_float_dtype, - is_integer_dtype, - is_bool_dtype, - is_period_dtype, - is_timedelta64_dtype, - is_object_dtype) -from pandas.core.dtypes.generic import ABCSeries, ABCDataFrame, ABCIndexClass -from pandas.core.dtypes.dtypes import DatetimeTZDtype - + is_bool_dtype, is_datetime64_any_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_extension_array_dtype, is_float_dtype, + is_integer_dtype, is_list_like, is_object_dtype, is_offsetlike, + is_period_dtype, is_timedelta64_dtype, needs_i8_conversion) +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core.algorithms import checked_add_with_arr, take, unique1d import pandas.core.common as com -from pandas.core.algorithms import checked_add_with_arr + +from pandas.tseries import frequencies +from pandas.tseries.offsets import DateOffset, Tick from .base import ExtensionOpsMixin -from pandas.util._decorators import deprecate_kwarg -def _make_comparison_op(op, cls): +def _make_comparison_op(cls, op): # TODO: share code with indexes.base version? Main difference is that # the block for MultiIndex was removed here. 
def cmp_method(self, other): @@ -62,7 +54,7 @@ def cmp_method(self, other): with warnings.catch_warnings(record=True): warnings.filterwarnings("ignore", "elementwise", FutureWarning) with np.errstate(all='ignore'): - result = op(self.values, np.asarray(other)) + result = op(self._data, np.asarray(other)) return result @@ -76,29 +68,200 @@ class AttributesMixin(object): @property def _attributes(self): # Inheriting subclass should implement _attributes as a list of strings - from pandas.errors import AbstractMethodError raise AbstractMethodError(self) @classmethod def _simple_new(cls, values, **kwargs): - from pandas.errors import AbstractMethodError raise AbstractMethodError(cls) def _get_attributes_dict(self): - """return an attributes dict for my class""" + """ + return an attributes dict for my class + """ return {k: getattr(self, k, None) for k in self._attributes} - def _shallow_copy(self, values=None, **kwargs): - if values is None: - # Note: slightly different from Index implementation which defaults - # to self.values - values = self._ndarray_values - attributes = self._get_attributes_dict() - attributes.update(kwargs) - if not len(values) and 'dtype' not in kwargs: - attributes['dtype'] = self.dtype - return self._simple_new(values, **attributes) +class DatelikeOps(object): + """ + Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. + """ + + def strftime(self, date_format): + from pandas import Index + return Index(self.format(date_format=date_format), + dtype=compat.text_type) + strftime.__doc__ = """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format doc <{0}>`__ + + Parameters + ---------- + date_format : str + Date format string (e.g. "%Y-%m-%d"). + + Returns + ------- + Index + Index of formatted strings + + See Also + -------- + to_datetime : Convert the given argument to datetime. + DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq='s') + >>> rng.strftime('%B %d, %Y, %r') + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """.format("https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior") + + +class TimelikeOps(object): + """ + Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. + """ + + _round_doc = ( + """ + Perform {op} operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times + + .. 
versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 + + Returns + ------- + DatetimeIndex, TimedeltaIndex, or Series + Index of the same type for a DatetimeIndex or TimedeltaIndex, + or a Series with the same index for a Series. + + Raises + ------ + ValueError if the `freq` cannot be converted. + + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq='T') + """) + + _round_example = ( + """>>> rng.round('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.round("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """) + + _floor_example = ( + """>>> rng.floor('H') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.floor("H") + 0 2018-01-01 11:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + ) + + _ceil_example = ( + """>>> rng.ceil('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.ceil("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 13:00:00 + dtype: datetime64[ns] + """ + ) + + def _round(self, freq, mode, ambiguous, nonexistent): + # round the local times + values = _ensure_datetimelike_to_i8(self) + result = round_nsint64(values, mode, freq) + result = self._maybe_mask_results(result, fill_value=NaT) + + attribs = self._get_attributes_dict() + attribs['freq'] = None + if 'tz' in attribs: + attribs['tz'] = None + return self._ensure_localized( + self._shallow_copy(result, **attribs), ambiguous, nonexistent + ) + + @Appender((_round_doc + _round_example).format(op="round")) + def round(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round( + freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent + ) + + @Appender((_round_doc + _floor_example).format(op="floor")) + def floor(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) + + @Appender((_round_doc + _ceil_example).format(op="ceil")) + def ceil(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin): @@ -118,7 +281,7 @@ def _box_func(self): """ box function to get object from internal representation """ - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _box_values(self, values): """ @@ -129,18 +292,17 @@ def _box_values(self, values): def __iter__(self): return (self._box_func(v) for v in self.asi8) - @property - def values(self): - """ return the underlying data as an ndarray """ - return self._data.view(np.ndarray) - @property def asi8(self): # do not cache or you'll create a 
memory leak - return self.values.view('i8') + return self._data.view('i8') - # ------------------------------------------------------------------ - # Array-like Methods + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + @property + def nbytes(self): + return self._data.nbytes @property def shape(self): @@ -171,7 +333,7 @@ def __getitem__(self, key): return self._box_func(val) if com.is_bool_indexer(key): - key = np.asarray(key) + key = np.asarray(key, dtype=bool) if key.all(): key = slice(0, None, None) else: @@ -207,24 +369,93 @@ def astype(self, dtype, copy=True): return self._box_values(self.asi8) return super(DatetimeLikeArrayMixin, self).astype(dtype, copy) + # ------------------------------------------------------------------ + # ExtensionArray Interface + # TODO: + # * _from_sequence + # * argsort / _values_for_argsort + # * _reduce + + def unique(self): + result = unique1d(self.asi8) + return type(self)(result, dtype=self.dtype) + + def _validate_fill_value(self, fill_value): + """ + If a fill_value is passed to `take` convert it to an i8 representation, + raising ValueError if this is not possible. + + Parameters + ---------- + fill_value : object + + Returns + ------- + fill_value : np.int64 + + Raises + ------ + ValueError + """ + raise AbstractMethodError(self) + + def take(self, indices, allow_fill=False, fill_value=None): + if allow_fill: + fill_value = self._validate_fill_value(fill_value) + + new_values = take(self.asi8, + indices, + allow_fill=allow_fill, + fill_value=fill_value) + + return type(self)(new_values, dtype=self.dtype) + + @classmethod + def _concat_same_type(cls, to_concat): + dtypes = {x.dtype for x in to_concat} + assert len(dtypes) == 1 + dtype = list(dtypes)[0] + + values = np.concatenate([x.asi8 for x in to_concat]) + return cls(values, dtype=dtype) + + def copy(self, deep=False): + values = self.asi8.copy() + return type(self)(values, dtype=self.dtype, freq=self.freq) + + def _values_for_factorize(self): + return self.asi8, iNaT + + @classmethod + def _from_factorized(cls, values, original): + return cls(values, dtype=original.dtype) + # ------------------------------------------------------------------ # Null Handling + def isna(self): + return self._isnan + @property # NB: override with cache_readonly in immutable subclasses def _isnan(self): - """ return if each value is nan""" + """ + return if each value is nan + """ return (self.asi8 == iNaT) @property # NB: override with cache_readonly in immutable subclasses def hasnans(self): - """ return if I have any nans; enables various perf speedups """ - return self._isnan.any() + """ + return if I have any nans; enables various perf speedups + """ + return bool(self._isnan.any()) - def _maybe_mask_results(self, result, fill_value=None, convert=None): + def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): """ Parameters ---------- result : a ndarray + fill_value : object, default iNaT convert : string/dtype or None Returns @@ -245,33 +476,14 @@ def _maybe_mask_results(self, result, fill_value=None, convert=None): result[self._isnan] = fill_value return result - def _nat_new(self, box=True): - """ - Return Array/Index or ndarray filled with NaT which has the same - length as the caller. - - Parameters - ---------- - box : boolean, default True - - If True returns a Array/Index as the same as caller. - - If False returns ndarray of np.int64. 
- """ - result = np.zeros(len(self), dtype=np.int64) - result.fill(iNaT) - if not box: - return result - - attribs = self._get_attributes_dict() - if not is_period_dtype(self): - attribs['freq'] = None - return self._simple_new(result, **attribs) - # ------------------------------------------------------------------ # Frequency Properties/Methods @property def freq(self): - """Return the frequency object if it is set, otherwise None""" + """ + Return the frequency object if it is set, otherwise None. + """ return self._freq @freq.setter @@ -345,33 +557,72 @@ def _validate_frequency(cls, index, freq, **kwargs): # ------------------------------------------------------------------ # Arithmetic Methods - def _add_datelike(self, other): + def _add_datetimelike_scalar(self, other): + # Overriden by TimedeltaArray raise TypeError("cannot add {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) - def _sub_datelike(self, other): - raise com.AbstractMethodError(self) + _add_datetime_arraylike = _add_datetimelike_scalar + + def _sub_datetimelike_scalar(self, other): + # Overridden by DatetimeArray + assert other is not NaT + raise TypeError("cannot subtract a datelike from a {cls}" + .format(cls=type(self).__name__)) + + _sub_datetime_arraylike = _sub_datetimelike_scalar def _sub_period(self, other): - return NotImplemented + # Overriden by PeriodArray + raise TypeError("cannot subtract Period from a {cls}" + .format(cls=type(self).__name__)) def _add_offset(self, offset): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _add_delta(self, other): - return NotImplemented + """ + Add a timedelta-like, Tick or TimedeltaIndex-like object + to self, yielding an int64 numpy array + + Parameters + ---------- + delta : {timedelta, np.timedelta64, Tick, + TimedeltaIndex, ndarray[timedelta64]} + + Returns + ------- + result : ndarray[int64] + + Notes + ----- + The result's name is set outside of _add_delta by the calling + method (__add__ or __sub__), if necessary (i.e. for Indexes). 
+ """ + if isinstance(other, (Tick, timedelta, np.timedelta64)): + new_values = self._add_timedeltalike_scalar(other) + elif is_timedelta64_dtype(other): + # ndarray[timedelta64] or TimedeltaArray/index + new_values = self._add_delta_tdi(other) + + return new_values - def _add_delta_td(self, other): + def _add_timedeltalike_scalar(self, other): """ Add a delta of a timedeltalike return the i8 result view """ + if isna(other): + # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds + new_values = np.empty(len(self), dtype='i8') + new_values[:] = iNaT + return new_values + inc = delta_to_nanoseconds(other) new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view('i8') - if self.hasnans: - new_values[self._isnan] = iNaT + new_values = self._maybe_mask_results(new_values) return new_values.view('i8') def _add_delta_tdi(self, other): @@ -379,9 +630,14 @@ def _add_delta_tdi(self, other): Add a delta of a TimedeltaIndex return the i8 result view """ - if not len(self) == len(other): + if len(self) != len(other): raise ValueError("cannot add indices of unequal length") + if isinstance(other, np.ndarray): + # ndarray[timedelta64]; wrap in TimedeltaIndex for op + from pandas import TimedeltaIndex + other = TimedeltaIndex(other) + self_i8 = self.asi8 other_i8 = other.asi8 new_values = checked_add_with_arr(self_i8, other_i8, @@ -393,7 +649,9 @@ def _add_delta_tdi(self, other): return new_values.view('i8') def _add_nat(self): - """Add pd.NaT to self""" + """ + Add pd.NaT to self + """ if is_period_dtype(self): raise TypeError('Cannot add {cls} and {typ}' .format(cls=type(self).__name__, @@ -401,10 +659,16 @@ def _add_nat(self): # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes - return self._nat_new(box=True) + result = np.zeros(len(self), dtype=np.int64) + result.fill(iNaT) + if is_timedelta64_dtype(self): + return type(self)(result, freq=None) + return type(self)(result, tz=self.tz, freq=None) def _sub_nat(self): - """Subtract pd.NaT from self""" + """ + Subtract pd.NaT from self + """ # GH#19124 Timedelta - datetime is not in general well-defined. # We make an exception for pd.NaT, which in this case quacks # like a timedelta. 
@@ -431,11 +695,11 @@ def _sub_period_array(self, other): Array of DateOffset objects; nulls represented by NaT """ if not is_period_dtype(self): - raise TypeError("cannot subtract {dtype}-dtype to {cls}" + raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) - if not len(self) == len(other): + if len(self) != len(other): raise ValueError("cannot subtract arrays/indices of " "unequal length") if self.freq != other.freq: @@ -467,18 +731,11 @@ def _addsub_int_array(self, other, op): ------- result : same class as self """ + # _addsub_int_array is overriden by PeriodArray + assert not is_period_dtype(self) assert op in [operator.add, operator.sub] - if is_period_dtype(self): - # easy case for PeriodIndex - if op is operator.sub: - other = -other - res_values = checked_add_with_arr(self.asi8, other, - arr_mask=self._isnan) - res_values = res_values.view('i8') - res_values[self._isnan] = iNaT - return self._from_ordinals(res_values, freq=self.freq) - - elif self.freq is None: + + if self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") @@ -518,10 +775,9 @@ def _addsub_offset_array(self, other, op): left = lib.values_from_object(self.astype('O')) res_values = op(left, np.array(other)) - kwargs = {} if not is_period_dtype(self): - kwargs['freq'] = 'infer' - return type(self)(res_values, **kwargs) + return type(self)(res_values, freq='infer') + return self._from_sequence(res_values) @deprecate_kwarg(old_arg_name='n', new_arg_name='periods') def shift(self, periods, freq=None): @@ -552,6 +808,7 @@ def shift(self, periods, freq=None): See Also -------- Index.shift : Shift values of Index. + PeriodIndex.shift : Shift values of PeriodIndex. """ return self._time_shift(periods=periods, freq=freq) @@ -588,162 +845,177 @@ def _time_shift(self, periods, freq=None): start = self[0] + periods * self.freq end = self[-1] + periods * self.freq - attribs = self._get_attributes_dict() + + # Note: in the DatetimeTZ case, _generate_range will infer the + # appropriate timezone from `start` and `end`, so tz does not need + # to be passed explicitly. 
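# Illustration only: the _time_shift behaviour referenced above. With a fixed
# freq, shifting simply regenerates the range from the shifted endpoints.
import pandas as pd

dti = pd.date_range("2018-01-01", periods=4, freq="D")
shifted = dti.shift(2)

assert shifted[0] == dti[0] + 2 * dti.freq
assert shifted.freq == dti.freq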
return self._generate_range(start=start, end=end, periods=None, - **attribs) + freq=self.freq) - @classmethod - def _add_datetimelike_methods(cls): - """ - add in the datetimelike methods (as we may have to override the - superclass) - """ - - def __add__(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - - # scalar others - elif other is NaT: - result = self._add_nat() - elif isinstance(other, (Tick, timedelta, np.timedelta64)): - result = self._add_delta(other) - elif isinstance(other, DateOffset): - # specifically _not_ a Tick - result = self._add_offset(other) - elif isinstance(other, (datetime, np.datetime64)): - result = self._add_datelike(other) - elif lib.is_integer(other): - # This check must come after the check for np.timedelta64 - # as is_integer returns True for these - result = self._time_shift(other) - - # array-like others - elif is_timedelta64_dtype(other): - # TimedeltaIndex, ndarray[timedelta64] - result = self._add_delta(other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.add) - elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): - # DatetimeIndex, ndarray[datetime64] - return self._add_datelike(other) - elif is_integer_dtype(other): - result = self._addsub_int_array(other, operator.add) - elif is_float_dtype(other) or is_period_dtype(other): - # Explicitly catch invalid dtypes - raise TypeError("cannot add {dtype}-dtype to {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) - elif is_extension_array_dtype(other): - # Categorical op will raise; defer explicitly - return NotImplemented - else: # pragma: no cover - return NotImplemented + def __add__(self, other): + other = lib.item_from_zerodim(other) + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented - return result + # scalar others + elif other is NaT: + result = self._add_nat() + elif isinstance(other, (Tick, timedelta, np.timedelta64)): + result = self._add_delta(other) + elif isinstance(other, DateOffset): + # specifically _not_ a Tick + result = self._add_offset(other) + elif isinstance(other, (datetime, np.datetime64)): + result = self._add_datetimelike_scalar(other) + elif lib.is_integer(other): + # This check must come after the check for np.timedelta64 + # as is_integer returns True for these + maybe_integer_op_deprecated(self) + result = self._time_shift(other) + + # array-like others + elif is_timedelta64_dtype(other): + # TimedeltaIndex, ndarray[timedelta64] + result = self._add_delta(other) + elif is_offsetlike(other): + # Array/Index of DateOffset objects + result = self._addsub_offset_array(other, operator.add) + elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): + # DatetimeIndex, ndarray[datetime64] + return self._add_datetime_arraylike(other) + elif is_integer_dtype(other): + maybe_integer_op_deprecated(self) + result = self._addsub_int_array(other, operator.add) + elif is_float_dtype(other): + # Explicitly catch invalid dtypes + raise TypeError("cannot add {dtype}-dtype to {cls}" + .format(dtype=other.dtype, + cls=type(self).__name__)) + elif is_period_dtype(other): + # if self is a TimedeltaArray and other is a PeriodArray with + # a timedelta-like (i.e. Tick) freq, this operation is valid. + # Defer to the PeriodArray implementation. + # In remaining cases, this will end up raising TypeError. 
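# Illustration only: a few of the __add__ dispatch branches above, shown with
# public objects. The sample data is arbitrary.
import numpy as np
import pandas as pd

dti = pd.date_range("2018-01-01", periods=3, freq="D")

dti + pd.Timedelta("1H")         # timedelta-like scalar -> _add_delta
dti + pd.DateOffset(months=1)    # non-Tick DateOffset   -> _add_offset

try:
    dti + np.array([1.5, 2.5, 3.5])   # float dtype is explicitly rejected
except TypeError as err:
    print(err)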
+ return NotImplemented + elif is_extension_array_dtype(other): + # Categorical op will raise; defer explicitly + return NotImplemented + else: # pragma: no cover + return NotImplemented + + if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): + from pandas.core.arrays import TimedeltaArrayMixin + # TODO: infer freq? + return TimedeltaArrayMixin(result) + return result - cls.__add__ = __add__ - - def __radd__(self, other): - # alias for __add__ - return self.__add__(other) - cls.__radd__ = __radd__ - - def __sub__(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - - # scalar others - elif other is NaT: - result = self._sub_nat() - elif isinstance(other, (Tick, timedelta, np.timedelta64)): - result = self._add_delta(-other) - elif isinstance(other, DateOffset): - # specifically _not_ a Tick - result = self._add_offset(-other) - elif isinstance(other, (datetime, np.datetime64)): - result = self._sub_datelike(other) - elif lib.is_integer(other): - # This check must come after the check for np.timedelta64 - # as is_integer returns True for these - result = self._time_shift(-other) - elif isinstance(other, Period): - result = self._sub_period(other) - - # array-like others - elif is_timedelta64_dtype(other): - # TimedeltaIndex, ndarray[timedelta64] - result = self._add_delta(-other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.sub) - elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): - # DatetimeIndex, ndarray[datetime64] - result = self._sub_datelike(other) - elif is_period_dtype(other): - # PeriodIndex - result = self._sub_period_array(other) - elif is_integer_dtype(other): - result = self._addsub_int_array(other, operator.sub) - elif isinstance(other, ABCIndexClass): - raise TypeError("cannot subtract {cls} and {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) - elif is_float_dtype(other): - # Explicitly catch invalid dtypes - raise TypeError("cannot subtract {dtype}-dtype from {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) - elif is_extension_array_dtype(other): - # Categorical op will raise; defer explicitly - return NotImplemented - else: # pragma: no cover - return NotImplemented + def __radd__(self, other): + # alias for __add__ + return self.__add__(other) - return result + def __sub__(self, other): + other = lib.item_from_zerodim(other) + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented - cls.__sub__ = __sub__ - - def __rsub__(self, other): - if is_datetime64_dtype(other) and is_timedelta64_dtype(self): - # ndarray[datetime64] cannot be subtracted from self, so - # we need to wrap in DatetimeArray/Index and flip the operation - if not isinstance(other, DatetimeLikeArrayMixin): - # Avoid down-casting DatetimeIndex - from pandas.core.arrays import DatetimeArrayMixin - other = DatetimeArrayMixin(other) - return other - self - elif (is_datetime64_any_dtype(self) and hasattr(other, 'dtype') and - not is_datetime64_any_dtype(other)): - # GH#19959 datetime - datetime is well-defined as timedelta, - # but any other type - datetime is not well-defined. 
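# Illustration only: datetime - datetime goes through _sub_datetime_arraylike
# and comes back as timedelta64[ns] data.
import pandas as pd

left = pd.date_range("2018-01-02", periods=3, freq="D")
right = pd.date_range("2018-01-01", periods=3, freq="D")

diff = left - right
assert diff.dtype == "timedelta64[ns]"
assert (diff == pd.Timedelta("1D")).all()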
- raise TypeError("cannot subtract {cls} from {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) - return -(self - other) - cls.__rsub__ = __rsub__ - - def __iadd__(self, other): - # alias for __add__ - return self.__add__(other) - cls.__iadd__ = __iadd__ - - def __isub__(self, other): - # alias for __sub__ - return self.__sub__(other) - cls.__isub__ = __isub__ + # scalar others + elif other is NaT: + result = self._sub_nat() + elif isinstance(other, (Tick, timedelta, np.timedelta64)): + result = self._add_delta(-other) + elif isinstance(other, DateOffset): + # specifically _not_ a Tick + result = self._add_offset(-other) + elif isinstance(other, (datetime, np.datetime64)): + result = self._sub_datetimelike_scalar(other) + elif lib.is_integer(other): + # This check must come after the check for np.timedelta64 + # as is_integer returns True for these + maybe_integer_op_deprecated(self) + result = self._time_shift(-other) + + elif isinstance(other, Period): + result = self._sub_period(other) + + # array-like others + elif is_timedelta64_dtype(other): + # TimedeltaIndex, ndarray[timedelta64] + result = self._add_delta(-other) + elif is_offsetlike(other): + # Array/Index of DateOffset objects + result = self._addsub_offset_array(other, operator.sub) + elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): + # DatetimeIndex, ndarray[datetime64] + result = self._sub_datetime_arraylike(other) + elif is_period_dtype(other): + # PeriodIndex + result = self._sub_period_array(other) + elif is_integer_dtype(other): + maybe_integer_op_deprecated(self) + result = self._addsub_int_array(other, operator.sub) + elif isinstance(other, ABCIndexClass): + raise TypeError("cannot subtract {cls} and {typ}" + .format(cls=type(self).__name__, + typ=type(other).__name__)) + elif is_float_dtype(other): + # Explicitly catch invalid dtypes + raise TypeError("cannot subtract {dtype}-dtype from {cls}" + .format(dtype=other.dtype, + cls=type(self).__name__)) + elif is_extension_array_dtype(other): + # Categorical op will raise; defer explicitly + return NotImplemented + else: # pragma: no cover + return NotImplemented + + if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): + from pandas.core.arrays import TimedeltaArrayMixin + # TODO: infer freq? + return TimedeltaArrayMixin(result) + return result + + def __rsub__(self, other): + if is_datetime64_dtype(other) and is_timedelta64_dtype(self): + # ndarray[datetime64] cannot be subtracted from self, so + # we need to wrap in DatetimeArray/Index and flip the operation + if not isinstance(other, DatetimeLikeArrayMixin): + # Avoid down-casting DatetimeIndex + from pandas.core.arrays import DatetimeArrayMixin + other = DatetimeArrayMixin(other) + return other - self + elif (is_datetime64_any_dtype(self) and hasattr(other, 'dtype') and + not is_datetime64_any_dtype(other)): + # GH#19959 datetime - datetime is well-defined as timedelta, + # but any other type - datetime is not well-defined. + raise TypeError("cannot subtract {cls} from {typ}" + .format(cls=type(self).__name__, + typ=type(other).__name__)) + elif is_period_dtype(self) and is_timedelta64_dtype(other): + # TODO: Can we simplify/generalize these cases at all? 
+ raise TypeError("cannot subtract {cls} from {dtype}" + .format(cls=type(self).__name__, + dtype=other.dtype)) + return -(self - other) + + # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 + def __iadd__(self, other): + # alias for __add__ + return self.__add__(other) + + def __isub__(self, other): + # alias for __sub__ + return self.__sub__(other) # -------------------------------------------------------------- # Comparison Methods + # Called by _add_comparison_methods defined in ExtensionOpsMixin + _create_comparison_method = classmethod(_make_comparison_op) + def _evaluate_compare(self, other, op): """ We have been called because a comparison between - 8 aware arrays. numpy >= 1.11 will - now warn about NaT comparisons + 8 aware arrays. numpy will warn about NaT comparisons """ # Called by comparison methods when comparing datetimelike # with datetimelike @@ -772,21 +1044,8 @@ def _evaluate_compare(self, other, op): result[mask] = filler return result - # TODO: get this from ExtensionOpsMixin - @classmethod - def _add_comparison_methods(cls): - """ add in comparison methods """ - # DatetimeArray and TimedeltaArray comparison methods will - # call these as their super(...) methods - cls.__eq__ = _make_comparison_op(operator.eq, cls) - cls.__ne__ = _make_comparison_op(operator.ne, cls) - cls.__lt__ = _make_comparison_op(operator.lt, cls) - cls.__gt__ = _make_comparison_op(operator.gt, cls) - cls.__le__ = _make_comparison_op(operator.le, cls) - cls.__ge__ = _make_comparison_op(operator.ge, cls) - -DatetimeLikeArrayMixin._add_comparison_methods() +DatetimeLikeArrayMixin._add_comparison_ops() # ------------------------------------------------------------------- @@ -852,6 +1111,41 @@ def validate_endpoints(closed): return left_closed, right_closed +def validate_inferred_freq(freq, inferred_freq, freq_infer): + """ + If the user passes a freq and another freq is inferred from passed data, + require that they match. + + Parameters + ---------- + freq : DateOffset or None + inferred_freq : DateOffset or None + freq_infer : bool + + Returns + ------- + freq : DateOffset or None + freq_infer : bool + + Notes + ----- + We assume at this point that `maybe_infer_freq` has been called, so + `freq` is either a DateOffset object or None. + """ + if inferred_freq is not None: + if freq is not None and freq != inferred_freq: + raise ValueError('Inferred frequency {inferred} from passed ' + 'values does not conform to passed frequency ' + '{passed}' + .format(inferred=inferred_freq, + passed=freq.freqstr)) + elif freq is None: + freq = inferred_freq + freq_infer = False + + return freq, freq_infer + + def maybe_infer_freq(freq): """ Comparing a DateOffset to the string "infer" raises, so we need to @@ -879,34 +1173,37 @@ def maybe_infer_freq(freq): return freq, freq_infer -def validate_tz_from_dtype(dtype, tz): +def _ensure_datetimelike_to_i8(other, to_utc=False): """ - If the given dtype is a DatetimeTZDtype, extract the implied - tzinfo object from it and check that it does not conflict with the given - tz. + Helper for coercing an input scalar or array to i8. Parameters ---------- - dtype : dtype, str - tz : None, tzinfo + other : 1d array + to_utc : bool, default False + If True, convert the values to UTC before extracting the i8 values + If False, extract the i8 values directly. 
Returns ------- - tz : consensus tzinfo - - Raises - ------ - ValueError : on tzinfo mismatch + i8 1d array """ - if dtype is not None: + from pandas import Index + from pandas.core.arrays import PeriodArray + + if lib.is_scalar(other) and isna(other): + return iNaT + elif isinstance(other, (PeriodArray, ABCIndexClass)): + # convert tz if needed + if getattr(other, 'tz', None) is not None: + if to_utc: + other = other.tz_convert('UTC') + else: + other = other.tz_localize(None) + else: try: - dtype = DatetimeTZDtype.construct_from_string(dtype) - dtz = getattr(dtype, 'tz', None) - if dtz is not None: - if tz is not None and not timezones.tz_compare(tz, dtz): - raise ValueError("cannot supply both a tz and a dtype" - " with a tz") - tz = dtz + return np.array(other, copy=False).view('i8') except TypeError: - pass - return tz + # period array cannot be coerced to int + other = Index(other) + return other.asi8 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index a0a9b57712249..a92e2f6157b40 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1,41 +1,33 @@ # -*- coding: utf-8 -*- -from datetime import datetime, timedelta, time +from datetime import datetime, time import warnings import numpy as np from pytz import utc from pandas._libs import lib, tslib -from pandas._libs.tslib import Timestamp, NaT, iNaT from pandas._libs.tslibs import ( - normalize_date, - conversion, fields, timezones, - resolution as libresolution) - -from pandas.util._decorators import cache_readonly -from pandas.errors import PerformanceWarning, AbstractMethodError -from pandas import compat + NaT, Timestamp, ccalendar, conversion, fields, iNaT, normalize_date, + resolution as libresolution, timezones) +import pandas.compat as compat +from pandas.errors import PerformanceWarning +from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - _NS_DTYPE, - is_object_dtype, - is_datetime64tz_dtype, - is_datetime64_dtype, - is_timedelta64_dtype, - ensure_int64) + _INT64_DTYPE, _NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype, + is_extension_type, is_float_dtype, is_int64_dtype, is_object_dtype, + is_period_dtype, is_string_dtype, is_timedelta64_dtype) from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.missing import isna from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna -import pandas.core.common as com -from pandas.core.algorithms import checked_add_with_arr from pandas.core import ops - -from pandas.tseries.frequencies import to_offset -from pandas.tseries.offsets import Tick, generate_range - +from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl +import pandas.core.common as com +from pandas.tseries.frequencies import get_period_alias, to_offset +from pandas.tseries.offsets import Tick, generate_range _midnight = time(0, 0) @@ -46,7 +38,12 @@ def _to_m8(key, tz=None): """ if not isinstance(key, Timestamp): # this also converts strings - key = Timestamp(key, tz=tz) + key = Timestamp(key) + if key.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + key = key.tz_convert(tz) + else: + key = key.tz_localize(tz) return np.int64(conversion.pydt_to_i8(key)).view(_NS_DTYPE) @@ -54,9 +51,8 @@ def _to_m8(key, tz=None): def _field_accessor(name, field, docstring=None): def f(self): values = self.asi8 - if self.tz is not None: - if self.tz is not utc: - values 
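# Illustration only: the i8 view that _ensure_datetimelike_to_i8 above works
# with. tz-aware values are stored as UTC nanoseconds, so the i8 data is
# unchanged by tz_convert; only the rendering timezone differs.
import pandas as pd

dti = pd.date_range("2018-01-01", periods=3, freq="H", tz="US/Eastern")
assert (dti.asi8 == dti.tz_convert("UTC").asi8).all()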
= self._local_timestamps() + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() if field in self._bool_ops: if field.endswith(('start', 'end')): @@ -76,16 +72,17 @@ def f(self): if field in self._object_ops: result = fields.get_date_name_field(values, field) - result = self._maybe_mask_results(result) + result = self._maybe_mask_results(result, fill_value=None) else: result = fields.get_date_field(values, field) - result = self._maybe_mask_results(result, convert='float64') + result = self._maybe_mask_results(result, fill_value=None, + convert='float64') return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -110,35 +107,43 @@ def wrapper(self, other): # string that cannot be parsed to Timestamp return ops.invalid_comparison(self, other, op) - result = meth(self, other) + result = op(self.asi8, other.view('i8')) if isna(other): result.fill(nat_result) elif lib.is_scalar(other): return ops.invalid_comparison(self, other, op) else: if isinstance(other, list): - # FIXME: This can break for object-dtype with mixed types - other = type(self)(other) - elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): + try: + other = type(self)(other) + except ValueError: + other = np.array(other, dtype=np.object_) + elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries, + DatetimeArrayMixin)): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. return ops.invalid_comparison(self, other, op) if is_object_dtype(other): result = op(self.astype('O'), np.array(other)) + o_mask = isna(other) elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): # e.g. is_timedelta64_dtype(other) return ops.invalid_comparison(self, other, op) else: self._assert_tzawareness_compat(other) - result = meth(self, np.asarray(other)) + if not hasattr(other, 'asi8'): + # ndarray, Series + other = type(self)(other) + result = meth(self, other) + o_mask = other._isnan result = com.values_from_object(result) # Make sure to pass an array to result[...]; indexing with # Series breaks with older version of numpy - o_mask = np.array(isna(other)) + o_mask = np.array(o_mask) if o_mask.any(): result[o_mask] = nat_result @@ -150,13 +155,16 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin): +class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin, + dtl.TimelikeOps, + dtl.DatelikeOps): """ Assumes that subclass __new__/__init__ defines: tz _freq _data """ + _typ = "datetimearray" _bool_ops = ['is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end', 'is_leap_year'] @@ -166,27 +174,33 @@ class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin): # by returning NotImplemented timetuple = None + # Needed so that Timestamp.__richcmp__(DateTimeArray) operates pointwise + ndim = 1 + + # ensure that operations with numpy arrays defer to our implementation + __array_priority__ = 1000 + # ----------------------------------------------------------------- # Constructors _attributes = ["freq", "tz"] + _tz = None + _freq = None @classmethod - def _simple_new(cls, values, freq=None, tz=None, **kwargs): + def _simple_new(cls, values, freq=None, tz=None): """ we require the we have a dtype compat for the values if we are passed a non-dtype compat, then coerce using the constructor """ + assert isinstance(values, np.ndarray), type(values) + if 
values.dtype == 'i8': + # for compat with datetime/timedelta/period shared methods, + # we can sometimes get here with int64 values. These represent + # nanosecond UTC (or tz-naive) unix timestamps + values = values.view('M8[ns]') - if getattr(values, 'dtype', None) is None: - # empty, but with dtype compat - if values is None: - values = np.empty(0, dtype=_NS_DTYPE) - return cls(values, freq=freq, tz=tz, **kwargs) - values = np.array(values, copy=False) - - if not is_datetime64_dtype(values): - values = ensure_int64(values).view(_NS_DTYPE) + assert values.dtype == 'M8[ns]', values.dtype result = object.__new__(cls) result._data = values @@ -195,29 +209,46 @@ def _simple_new(cls, values, freq=None, tz=None, **kwargs): result._tz = timezones.tz_standardize(tz) return result - def __new__(cls, values, freq=None, tz=None, dtype=None): - if tz is None and hasattr(values, 'tz'): - # e.g. DatetimeIndex - tz = values.tz + def __new__(cls, values, freq=None, tz=None, dtype=None, copy=False, + dayfirst=False, yearfirst=False, ambiguous='raise'): + return cls._from_sequence( + values, freq=freq, tz=tz, dtype=dtype, copy=copy, + dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous) + + @classmethod + def _from_sequence(cls, data, dtype=None, copy=False, + tz=None, freq=None, + dayfirst=False, yearfirst=False, ambiguous='raise'): freq, freq_infer = dtl.maybe_infer_freq(freq) - # if dtype has an embedded tz, capture it - tz = dtl.validate_tz_from_dtype(dtype, tz) + subarr, tz, inferred_freq = sequence_to_dt64ns( + data, dtype=dtype, copy=copy, tz=tz, + dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous) + + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, + freq_infer) - result = cls._simple_new(values, freq=freq, tz=tz) - if freq_infer: - inferred = result.inferred_freq - if inferred: - result.freq = to_offset(inferred) + result = cls._simple_new(subarr, freq=freq, tz=tz) + + if inferred_freq is None and freq is not None: + # this condition precludes `freq_infer` + cls._validate_frequency(result, freq, ambiguous=ambiguous) + + elif freq_infer: + result.freq = to_offset(result.inferred_freq) - # NB: Among other things not yet ported from the DatetimeIndex - # constructor, this does not call _deepcopy_if_needed return result @classmethod def _generate_range(cls, start, end, periods, freq, tz=None, normalize=False, ambiguous='raise', closed=None): + + periods = dtl.validate_periods(periods) + if freq is None and any(x is None for x in [periods, start, end]): + raise ValueError('Must provide freq argument if no data is ' + 'supplied') + if com.count_not_none(start, end, periods, freq) != 3: raise ValueError('Of the four parameters: start, end, periods, ' 'and freq, exactly three must be specified') @@ -258,27 +289,22 @@ def _generate_range(cls, start, end, periods, freq, tz=None, end, end.tz, start.tz, freq, tz ) if freq is not None: - if cls._use_cached_range(freq, _normalized, start, end): - # Currently always False; never hit - # Should be reimplemented as apart of GH 17914 - index = cls._cached_range(start, end, periods=periods, - freq=freq) - else: - index = _generate_regular_range(cls, start, end, periods, freq) - - if tz is not None and getattr(index, 'tz', None) is None: - arr = conversion.tz_localize_to_utc( - ensure_int64(index.values), - tz, ambiguous=ambiguous) - - index = cls(arr) - - # index is localized datetime64 array -> have to convert - # start/end as well to compare - if start is not None: - start = start.tz_localize(tz).asm8 - if end is not None: - 
end = end.tz_localize(tz).asm8 + # TODO: consider re-implementing _cached_range; GH#17914 + index = _generate_regular_range(cls, start, end, periods, freq) + + if tz is not None and index.tz is None: + arr = conversion.tz_localize_to_utc( + index.asi8, + tz, ambiguous=ambiguous) + + index = cls(arr) + + # index is localized datetime64 array -> have to convert + # start/end as well to compare + if start is not None: + start = start.tz_localize(tz).asm8 + if end is not None: + end = end.tz_localize(tz).asm8 else: # Create a linearly spaced date_range in local time arr = np.linspace(start.value, end.value, periods) @@ -291,17 +317,7 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if not right_closed and len(index) and index[-1] == end: index = index[:-1] - return cls._simple_new(index.values, freq=freq, tz=tz) - - @classmethod - def _use_cached_range(cls, freq, _normalized, start, end): - # DatetimeArray is mutable, so is not cached - return False - - @classmethod - def _cached_range(cls, start=None, end=None, - periods=None, freq=None, **kwargs): - raise AbstractMethodError(cls) + return cls._simple_new(index.asi8, freq=freq, tz=tz) # ----------------------------------------------------------------- # Descriptive Properties @@ -310,7 +326,7 @@ def _cached_range(cls, start=None, end=None, def _box_func(self): return lambda x: Timestamp(x, freq=self.freq, tz=self.tz) - @cache_readonly + @property def dtype(self): if self.tz is None: return _NS_DTYPE @@ -318,6 +334,9 @@ def dtype(self): @property def tz(self): + """ + Return timezone. + """ # GH 18595 return self._tz @@ -336,12 +355,16 @@ def tzinfo(self): @property # NB: override with cache_readonly in immutable subclasses def _timezone(self): - """ Comparable timezone both for pytz / dateutil""" + """ + Comparable timezone both for pytz / dateutil + """ return timezones.get_timezone(self.tzinfo) @property def offset(self): - """get/set the frequency of the instance""" + """ + get/set the frequency of the instance + """ msg = ('{cls}.offset has been deprecated and will be removed ' 'in a future version; use {cls}.freq instead.' .format(cls=type(self).__name__)) @@ -350,7 +373,9 @@ def offset(self): @offset.setter def offset(self, value): - """get/set the frequency of the instance""" + """ + get/set the frequency of the instance + """ msg = ('{cls}.offset has been deprecated and will be removed ' 'in a future version; use {cls}.freq instead.' .format(cls=type(self).__name__)) @@ -369,7 +394,16 @@ def _resolution(self): return libresolution.resolution(self.asi8, self.tz) # ---------------------------------------------------------------- - # Array-like Methods + # Array-Like / EA-Interface Methods + + def __array__(self, dtype=None): + if is_object_dtype(dtype): + return np.array(list(self), dtype=object) + elif is_int64_dtype(dtype): + return self.asi8 + + # TODO: warn that conversion may be lossy? 
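# Illustration only: what the __array__ hook above hands back by default --
# the underlying datetime64[ns] data, whose int64 view matches asi8.
import numpy as np
import pandas as pd

dti = pd.date_range("2018-01-01", periods=2)
arr = np.asarray(dti)

assert arr.dtype == "datetime64[ns]"
assert (arr.view("i8") == dti.asi8).all()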
+ return self._data.view(np.ndarray) # follow Index.__array__ def __iter__(self): """ @@ -394,6 +428,25 @@ def __iter__(self): for v in converted: yield v + # ---------------------------------------------------------------- + # ExtensionArray Interface + + @property + def _ndarray_values(self): + return self._data + + @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) + def _validate_fill_value(self, fill_value): + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, (datetime, np.datetime64)): + self._assert_tzawareness_compat(fill_value) + fill_value = Timestamp(fill_value).value + else: + raise ValueError("'fill_value' should be a Timestamp. " + "Got '{got}'.".format(got=fill_value)) + return fill_value + # ----------------------------------------------------------------- # Comparison Methods @@ -429,11 +482,21 @@ def _assert_tzawareness_compat(self, other): # ----------------------------------------------------------------- # Arithmetic Methods - def _sub_datelike_dti(self, other): - """subtraction of two DatetimeIndexes""" - if not len(self) == len(other): + def _sub_datetime_arraylike(self, other): + """subtract DatetimeArray/Index or ndarray[datetime64]""" + if len(self) != len(other): raise ValueError("cannot add indices of unequal length") + if isinstance(other, np.ndarray): + assert is_datetime64_dtype(other) + other = type(self)(other) + + if not self._has_same_tz(other): + # require tz compat + raise TypeError("{cls} subtraction must have the same " + "timezones or no timezones" + .format(cls=type(self).__name__)) + self_i8 = self.asi8 other_i8 = other.asi8 new_values = checked_add_with_arr(self_i8, -other_i8, @@ -461,74 +524,41 @@ def _add_offset(self, offset): return type(self)(result, freq='infer') - def _sub_datelike(self, other): + def _sub_datetimelike_scalar(self, other): # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]] - if isinstance(other, (DatetimeArrayMixin, np.ndarray)): - if isinstance(other, np.ndarray): - # if other is an ndarray, we assume it is datetime64-dtype - other = type(self)(other) - if not self._has_same_tz(other): - # require tz compat - raise TypeError("{cls} subtraction must have the same " - "timezones or no timezones" - .format(cls=type(self).__name__)) - result = self._sub_datelike_dti(other) - elif isinstance(other, (datetime, np.datetime64)): - assert other is not NaT - other = Timestamp(other) - if other is NaT: - return self - NaT + assert isinstance(other, (datetime, np.datetime64)) + assert other is not NaT + other = Timestamp(other) + if other is NaT: + return self - NaT + + if not self._has_same_tz(other): # require tz compat - elif not self._has_same_tz(other): - raise TypeError("Timestamp subtraction must have the same " - "timezones or no timezones") - else: - i8 = self.asi8 - result = checked_add_with_arr(i8, -other.value, - arr_mask=self._isnan) - result = self._maybe_mask_results(result, - fill_value=iNaT) - else: - raise TypeError("cannot subtract {cls} and {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError("Timestamp subtraction must have the same " + "timezones or no timezones") + + i8 = self.asi8 + result = checked_add_with_arr(i8, -other.value, + arr_mask=self._isnan) + result = self._maybe_mask_results(result) return result.view('timedelta64[ns]') def _add_delta(self, delta): """ - Add a timedelta-like, DateOffset, or TimedeltaIndex-like object - to self. 
+ Add a timedelta-like, Tick, or TimedeltaIndex-like object + to self, yielding a new DatetimeArray Parameters ---------- - delta : {timedelta, np.timedelta64, DateOffset, - TimedelaIndex, ndarray[timedelta64]} + other : {timedelta, np.timedelta64, Tick, + TimedeltaIndex, ndarray[timedelta64]} Returns ------- - result : same type as self - - Notes - ----- - The result's name is set outside of _add_delta by the calling - method (__add__ or __sub__) + result : DatetimeArray """ - from pandas.core.arrays.timedeltas import TimedeltaArrayMixin - - if isinstance(delta, (Tick, timedelta, np.timedelta64)): - new_values = self._add_delta_td(delta) - elif is_timedelta64_dtype(delta): - if not isinstance(delta, TimedeltaArrayMixin): - delta = TimedeltaArrayMixin(delta) - new_values = self._add_delta_tdi(delta) - else: - new_values = self.astype('O') + delta - - tz = 'UTC' if self.tz is not None else None - result = type(self)(new_values, tz=tz, freq='infer') - if self.tz is not None and self.tz is not utc: - result = result.tz_convert(self.tz) - return result + new_values = dtl.DatetimeLikeArrayMixin._add_delta(self, delta) + return type(self)(new_values, tz=self.tz, freq='infer') # ----------------------------------------------------------------- # Timezone Conversion and Localization Methods @@ -564,7 +594,7 @@ def tz_convert(self, tz): See Also -------- - DatetimeIndex.tz : A timezone that has a variable offset from UTC + DatetimeIndex.tz : A timezone that has a variable offset from UTC. DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a given time zone, or remove timezone from a tz-aware DatetimeIndex. @@ -614,9 +644,10 @@ def tz_convert(self, tz): 'tz_localize to localize') # No conversion since timestamps are all UTC to begin with - return self._shallow_copy(tz=tz) + return self._simple_new(self.asi8, tz=tz, freq=self.freq) - def tz_localize(self, tz, ambiguous='raise', errors='raise'): + def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', + errors=None): """ Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. @@ -632,8 +663,13 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): tz : string, pytz.timezone, dateutil.tz.tzfile or None Time zone to convert timestamps to. Passing ``None`` will remove the time zone information preserving local time. - ambiguous : str {'infer', 'NaT', 'raise'} or bool array, - default 'raise' + ambiguous : 'infer', 'NaT', bool array, default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. - 'infer' will attempt to infer fall dst-transition hours based on order @@ -644,15 +680,27 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): - 'raise' will raise an AmbiguousTimeError if there are ambiguous times - errors : {'raise', 'coerce'}, default 'raise' + nonexistent : 'shift', 'NaT' default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent times forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. 
versionadded:: 0.24.0 + + errors : {'raise', 'coerce'}, default None - 'raise' will raise a NonExistentTimeError if a timestamp is not valid in the specified time zone (e.g. due to a transition from - or to DST time) + or to DST time). Use ``nonexistent='raise'`` instead. - 'coerce' will return NaT if the timestamp can not be converted - to the specified time zone + to the specified time zone. Use ``nonexistent='NaT'`` instead. - .. versionadded:: 0.19.0 + .. deprecated:: 0.24.0 Returns ------- @@ -693,21 +741,72 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', '2018-03-03 09:00:00'], dtype='datetime64[ns]', freq='D') + + Be careful with DST changes. When there is sequential data, pandas can + infer the DST time: + >>> s = pd.to_datetime(pd.Series([ + ... '2018-10-28 01:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 03:00:00', + ... '2018-10-28 03:30:00'])) + >>> s.dt.tz_localize('CET', ambiguous='infer') + 2018-10-28 01:30:00+02:00 0 + 2018-10-28 02:00:00+02:00 1 + 2018-10-28 02:30:00+02:00 2 + 2018-10-28 02:00:00+01:00 3 + 2018-10-28 02:30:00+01:00 4 + 2018-10-28 03:00:00+01:00 5 + 2018-10-28 03:30:00+01:00 6 + dtype: int64 + + In some cases, inferring the DST is impossible. In such cases, you can + pass an ndarray to the ambiguous parameter to set the DST explicitly + + >>> s = pd.to_datetime(pd.Series([ + ... '2018-10-28 01:20:00', + ... '2018-10-28 02:36:00', + ... '2018-10-28 03:46:00'])) + >>> s.dt.tz_localize('CET', ambiguous=np.array([True, True, False])) + 0 2018-10-28 01:20:00+02:00 + 1 2018-10-28 02:36:00+02:00 + 2 2018-10-28 03:46:00+01:00 + dtype: datetime64[ns, CET] """ + if errors is not None: + warnings.warn("The errors argument is deprecated and will be " + "removed in a future release. 
Use " + "nonexistent='NaT' or nonexistent='raise' " + "instead.", FutureWarning) + if errors == 'coerce': + nonexistent = 'NaT' + elif errors == 'raise': + nonexistent = 'raise' + else: + raise ValueError("The errors argument must be either 'coerce' " + "or 'raise'.") + + if nonexistent not in ('raise', 'NaT', 'shift'): + raise ValueError("The nonexistent argument must be one of 'raise'," + " 'NaT' or 'shift'") + if self.tz is not None: if tz is None: - new_dates = conversion.tz_convert(self.asi8, 'UTC', self.tz) + new_dates = conversion.tz_convert(self.asi8, timezones.UTC, + self.tz) else: raise TypeError("Already tz-aware, use tz_convert to convert.") else: tz = timezones.maybe_get_tz(tz) # Convert to UTC - new_dates = conversion.tz_localize_to_utc(self.asi8, tz, - ambiguous=ambiguous, - errors=errors) + new_dates = conversion.tz_localize_to_utc( + self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent, + ) new_dates = new_dates.view(_NS_DTYPE) - return self._shallow_copy(new_dates, tz=tz) + return self._simple_new(new_dates, tz=tz, freq=self.freq) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timestamp methods @@ -761,9 +860,97 @@ def normalize(self): '2014-08-01 00:00:00+05:30'], dtype='datetime64[ns, Asia/Calcutta]', freq=None) """ - new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) + if self.tz is None or timezones.is_utc(self.tz): + not_null = self.notna() + DAY_NS = ccalendar.DAY_SECONDS * 1000000000 + new_values = self.asi8.copy() + adjustment = (new_values[not_null] % DAY_NS) + new_values[not_null] = new_values[not_null] - adjustment + else: + new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) return type(self)(new_values, freq='infer').tz_localize(self.tz) + def to_period(self, freq=None): + """ + Cast to PeriodArray/Index at a particular frequency. + + Converts DatetimeArray/Index to PeriodArray/Index. + + Parameters + ---------- + freq : string or Offset, optional + One of pandas' :ref:`offset strings ` + or an Offset object. Will be inferred by default. + + Returns + ------- + PeriodArray/Index + + Raises + ------ + ValueError + When converting a DatetimeArray/Index with non-regular values, + so that a frequency cannot be inferred. + + Examples + -------- + >>> df = pd.DataFrame({"y": [1,2,3]}, + ... index=pd.to_datetime(["2000-03-31 00:00:00", + ... "2000-05-31 00:00:00", + ... "2000-08-31 00:00:00"])) + >>> df.index.to_period("M") + PeriodIndex(['2000-03', '2000-05', '2000-08'], + dtype='period[M]', freq='M') + + Infer the daily frequency + + >>> idx = pd.date_range("2017-01-01", periods=2) + >>> idx.to_period() + PeriodIndex(['2017-01-01', '2017-01-02'], + dtype='period[D]', freq='D') + + See Also + -------- + PeriodIndex: Immutable ndarray holding ordinal values. + DatetimeIndex.to_pydatetime: Return DatetimeIndex as object. + """ + from pandas.core.arrays import PeriodArray + + if self.tz is not None: + warnings.warn("Converting to PeriodArray/Index representation " + "will drop timezone information.", UserWarning) + + if freq is None: + freq = self.freqstr or self.inferred_freq + + if freq is None: + raise ValueError("You must pass a freq argument as " + "current index has none.") + + freq = get_period_alias(freq) + + return PeriodArray._from_datetime64(self._data, freq, tz=self.tz) + + def to_perioddelta(self, freq): + """ + Calculate TimedeltaArray of difference between index + values and index converted to PeriodArray at specified + freq. 
Used for vectorized offsets + + Parameters + ---------- + freq : Period frequency + + Returns + ------- + TimedeltaArray/Index + """ + # TODO: consider privatizing (discussion in GH#23113) + from pandas.core.arrays.timedeltas import TimedeltaArrayMixin + i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8 + m8delta = i8delta.view('m8[ns]') + return TimedeltaArrayMixin(m8delta) + # ----------------------------------------------------------------- # Properties - Vectorized Timestamp Properties/Methods @@ -793,14 +980,14 @@ def month_name(self, locale=None): >>> idx.month_name() Index(['January', 'February', 'March'], dtype='object') """ - if self.tz is not None and self.tz is not utc: + if self.tz is not None and not timezones.is_utc(self.tz): values = self._local_timestamps() else: values = self.asi8 result = fields.get_date_name_field(values, 'month_name', locale=locale) - result = self._maybe_mask_results(result) + result = self._maybe_mask_results(result, fill_value=None) return result def day_name(self, locale=None): @@ -829,14 +1016,14 @@ def day_name(self, locale=None): >>> idx.day_name() Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') """ - if self.tz is not None and self.tz is not utc: + if self.tz is not None and not timezones.is_utc(self.tz): values = self._local_timestamps() else: values = self.asi8 result = fields.get_date_name_field(values, 'day_name', locale=locale) - result = self._maybe_mask_results(result) + result = self._maybe_mask_results(result, fill_value=None) return result @property @@ -847,7 +1034,7 @@ def time(self): # If the Timestamps have a timezone that is not UTC, # convert them into their i8 representation while # keeping their timezone and not using UTC - if self.tz is not None and self.tz is not utc: + if self.tz is not None and not timezones.is_utc(self.tz): timestamps = self._local_timestamps() else: timestamps = self.asi8 @@ -871,26 +1058,26 @@ def date(self): # If the Timestamps have a timezone that is not UTC, # convert them into their i8 representation while # keeping their timezone and not using UTC - if self.tz is not None and self.tz is not utc: + if self.tz is not None and not timezones.is_utc(self.tz): timestamps = self._local_timestamps() else: timestamps = self.asi8 return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "The year of the datetime") + year = _field_accessor('year', 'Y', "\n The year of the datetime\n") month = _field_accessor('month', 'M', - "The month as January=1, December=12") - day = _field_accessor('day', 'D', "The days of the datetime") - hour = _field_accessor('hour', 'h', "The hours of the datetime") - minute = _field_accessor('minute', 'm', "The minutes of the datetime") - second = _field_accessor('second', 's', "The seconds of the datetime") + "\n The month as January=1, December=12 \n") + day = _field_accessor('day', 'D', "\nThe days of the datetime\n") + hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") + minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") + second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") microsecond = _field_accessor('microsecond', 'us', - "The microseconds of the datetime") + "\nThe microseconds of the datetime\n") nanosecond = _field_accessor('nanosecond', 'ns', - "The nanoseconds of the datetime") + "\nThe nanoseconds of the datetime\n") weekofyear = _field_accessor('weekofyear', 'woy', - "The week ordinal of the year") + "\nThe week ordinal of the year\n") week = 
weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -935,55 +1122,67 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "The ordinal day of the year") - quarter = _field_accessor('quarter', 'q', "The quarter of the date") + "The ordinal day of the year.") + quarter = _field_accessor('quarter', 'q', "The quarter of the date.") days_in_month = _field_accessor( 'days_in_month', 'dim', - "The number of days in the month") + "The number of days in the month.") daysinmonth = days_in_month - is_month_start = _field_accessor( - 'is_month_start', - 'is_month_start', - "Logical indicating if first day of month (defined by frequency)") - is_month_end = _field_accessor( - 'is_month_end', - 'is_month_end', - """ - Indicator for whether the date is the last day of the month. + _is_month_doc = """ + Indicates whether the date is the {first_or_last} day of the month. Returns ------- Series or array - For Series, returns a Series with boolean values. For - DatetimeIndex, returns a boolean array. + For Series, returns a Series with boolean values. + For DatetimeIndex, returns a boolean array. See Also -------- - is_month_start : Indicator for whether the date is the first day - of the month. + is_month_start : Return a boolean indicating whether the date + is the first day of the month. + is_month_end : Return a boolean indicating whether the date + is the last day of the month. Examples -------- This method is available on Series with datetime values under the ``.dt`` accessor, and directly on DatetimeIndex. - >>> dates = pd.Series(pd.date_range("2018-02-27", periods=3)) - >>> dates + >>> s = pd.Series(pd.date_range("2018-02-27", periods=3)) + >>> s 0 2018-02-27 1 2018-02-28 2 2018-03-01 dtype: datetime64[ns] - >>> dates.dt.is_month_end + >>> s.dt.is_month_start + 0 False + 1 False + 2 True + dtype: bool + >>> s.dt.is_month_end 0 False 1 True 2 False dtype: bool >>> idx = pd.date_range("2018-02-27", periods=3) + >>> idx.is_month_start + array([False, False, True]) >>> idx.is_month_end - array([False, True, False], dtype=bool) - """) + array([False, True, False]) + """ + is_month_start = _field_accessor( + 'is_month_start', + 'is_month_start', + _is_month_doc.format(first_or_last='first')) + + is_month_end = _field_accessor( + 'is_month_end', + 'is_month_end', + _is_month_doc.format(first_or_last='last')) + is_quarter_start = _field_accessor( 'is_quarter_start', 'is_quarter_start', @@ -1223,10 +1422,262 @@ def to_julian_date(self): DatetimeArrayMixin._add_comparison_ops() -DatetimeArrayMixin._add_datetimelike_methods() + + +# ------------------------------------------------------------------- +# Constructor Helpers + +def sequence_to_dt64ns(data, dtype=None, copy=False, + tz=None, + dayfirst=False, yearfirst=False, ambiguous='raise'): + """ + Parameters + ---------- + data : list-like + dtype : dtype, str, or None, default None + copy : bool, default False + tz : tzinfo, str, or None, default None + dayfirst : bool, default False + yearfirst : bool, default False + ambiguous : str, bool, or arraylike, default 'raise' + See pandas._libs.tslibs.conversion.tz_localize_to_utc + + Returns + ------- + result : numpy.ndarray + The sequence converted to a numpy array with dtype ``datetime64[ns]``. + tz : tzinfo or None + Either the user-provided tzinfo or one inferred from the data. + inferred_freq : Tick or None + The inferred frequency of the sequence. 
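# Illustration only: the (values, tz, inferred_freq) contract that
# sequence_to_dt64ns describes, observed through the public constructor.
import pandas as pd

dti = pd.DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"])
assert dti.dtype == "datetime64[ns]"      # values come back as M8[ns]
assert pd.infer_freq(dti) == "D"          # a regular spacing is inferable

aware = pd.DatetimeIndex(["2018-01-01 00:00:00+00:00"])
assert aware.tz is not None               # tz inferred from the data itself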
+ + Raises + ------ + TypeError : PeriodDType data is passed + """ + + inferred_freq = None + + if not hasattr(data, "dtype"): + # e.g. list, tuple + if np.ndim(data) == 0: + # i.e. generator + data = list(data) + data = np.asarray(data) + copy = False + elif isinstance(data, ABCSeries): + data = data._values + + if hasattr(data, "freq"): + # i.e. DatetimeArray/Index + inferred_freq = data.freq + + # if dtype has an embedded tz, capture it + tz = validate_tz_from_dtype(dtype, tz) + + # By this point we are assured to have either a numpy array or Index + data, copy = maybe_convert_dtype(data, copy) + + if is_object_dtype(data) or is_string_dtype(data): + # TODO: We do not have tests specific to string-dtypes, + # also complex or categorical or other extension + copy = False + if lib.infer_dtype(data) == 'integer': + data = data.astype(np.int64) + else: + # data comes back here as either i8 to denote UTC timestamps + # or M8[ns] to denote wall times + data, inferred_tz = objects_to_datetime64ns( + data, dayfirst=dayfirst, yearfirst=yearfirst) + tz = maybe_infer_tz(tz, inferred_tz) + + if is_datetime64tz_dtype(data): + tz = maybe_infer_tz(tz, data.tz) + result = data._data + + elif is_datetime64_dtype(data): + # tz-naive DatetimeArray/Index or ndarray[datetime64] + data = getattr(data, "_data", data) + if data.dtype != _NS_DTYPE: + data = conversion.ensure_datetime64ns(data) + + if tz is not None: + # Convert tz-naive to UTC + tz = timezones.maybe_get_tz(tz) + data = conversion.tz_localize_to_utc(data.view('i8'), tz, + ambiguous=ambiguous) + data = data.view(_NS_DTYPE) + + assert data.dtype == _NS_DTYPE, data.dtype + result = data + + else: + # must be integer dtype otherwise + # assume this data are epoch timestamps + if data.dtype != _INT64_DTYPE: + data = data.astype(np.int64, copy=False) + result = data.view(_NS_DTYPE) + + if copy: + # TODO: should this be deepcopy? + result = result.copy() + + assert isinstance(result, np.ndarray), type(result) + assert result.dtype == 'M8[ns]', result.dtype + + # We have to call this again after possibly inferring a tz above + validate_tz_from_dtype(dtype, tz) + + return result, tz, inferred_freq + + +def objects_to_datetime64ns(data, dayfirst, yearfirst, + utc=False, errors="raise", + require_iso8601=False, allow_object=False): + """ + Convert data to array of timestamps. + + Parameters + ---------- + data : np.ndarray[object] + dayfirst : bool + yearfirst : bool + utc : bool, default False + Whether to convert timezone-aware timestamps to UTC + errors : {'raise', 'ignore', 'coerce'} + allow_object : bool + Whether to return an object-dtype ndarray instead of raising if the + data contains more than one timezone. 
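# Illustration only: the wall-time vs UTC distinction that
# objects_to_datetime64ns draws above, seen via pd.to_datetime.
import numpy as np
import pandas as pd

data = np.array(["2018-01-01", "2018-01-02"], dtype=object)
naive = pd.to_datetime(data)              # no tz parsed -> wall times, M8[ns]
assert naive.tz is None

aware = pd.to_datetime(["2018-01-01 00:00:00+01:00"], utc=True)
assert aware.tz is not None               # tz-aware input comes back as UTC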
+ + Returns + ------- + result : ndarray + np.int64 dtype if returned values represent UTC timestamps + np.datetime64[ns] if returned values represent wall times + object if mixed timezones + inferred_tz : tzinfo or None + + Raises + ------ + ValueError : if data cannot be converted to datetimes + """ + assert errors in ["raise", "ignore", "coerce"] + + # if str-dtype, convert + data = np.array(data, copy=False, dtype=np.object_) + + try: + result, tz_parsed = tslib.array_to_datetime( + data, + errors=errors, + utc=utc, + dayfirst=dayfirst, + yearfirst=yearfirst, + require_iso8601=require_iso8601 + ) + except ValueError as e: + try: + values, tz_parsed = conversion.datetime_to_datetime64(data) + # If tzaware, these values represent unix timestamps, so we + # return them as i8 to distinguish from wall times + return values.view('i8'), tz_parsed + except (ValueError, TypeError): + raise e + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + # Return i8 values to denote unix timestamps + return result.view('i8'), tz_parsed + elif is_datetime64_dtype(result): + # returning M8[ns] denotes wall-times; since tz is None + # the distinction is a thin one + return result, tz_parsed + elif is_object_dtype(result): + # GH#23675 when called via `pd.to_datetime`, returning an object-dtype + # array is allowed. When called via `pd.DatetimeIndex`, we can + # only accept datetime64 dtype, so raise TypeError if object-dtype + # is returned, as that indicates the values can be recognized as + # datetimes but they have conflicting timezones/awareness + if allow_object: + return result, tz_parsed + raise TypeError(result) + else: # pragma: no cover + # GH#23675 this TypeError should never be hit, whereas the TypeError + # in the object-dtype branch above is reachable. + raise TypeError(result) + + +def maybe_convert_dtype(data, copy): + """ + Convert data based on dtype conventions, issuing deprecation warnings + or errors where appropriate. + + Parameters + ---------- + data : np.ndarray or pd.Index + copy : bool + + Returns + ------- + data : np.ndarray or pd.Index + copy : bool + + Raises + ------ + TypeError : PeriodDType data is passed + """ + if is_float_dtype(data): + # Note: we must cast to datetime64[ns] here in order to treat these + # as wall-times instead of UTC timestamps. + data = data.astype(_NS_DTYPE) + copy = False + # TODO: deprecate this behavior to instead treat symmetrically + # with integer dtypes. See discussion in GH#23675 + + elif is_timedelta64_dtype(data): + warnings.warn("Passing timedelta64-dtype data is deprecated, will " + "raise a TypeError in a future version", + FutureWarning, stacklevel=5) + data = data.view(_NS_DTYPE) + + elif is_period_dtype(data): + # Note: without explicitly raising here, PeriondIndex + # test_setops.test_join_does_not_recur fails + raise TypeError("Passing PeriodDtype data is invalid. " + "Use `data.to_timestamp()` instead") + + elif is_extension_type(data) and not is_datetime64tz_dtype(data): + # Includes categorical + # TODO: We have no tests for these + data = np.array(data, dtype=np.object_) + copy = False + + return data, copy def _generate_regular_range(cls, start, end, periods, freq): + """ + Generate a range of dates with the spans between dates described by + the given `freq` DateOffset. 
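# Illustration only: the PeriodDtype rejection added in maybe_convert_dtype
# above, assuming a pandas version that includes this change.
import pandas as pd

pi = pd.period_range("2018-01", periods=3, freq="M")

try:
    pd.DatetimeIndex(pi)                  # PeriodDtype data is rejected
except TypeError as err:
    print(err)

pd.DatetimeIndex(pi.to_timestamp())       # the suggested conversion works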
+ + Parameters + ---------- + cls : class + start : Timestamp or None + first point of produced date range + end : Timestamp or None + last point of produced date range + periods : int + number of periods in produced date range + freq : DateOffset + describes space between dates in produced date range + + Returns + ------- + ndarray[np.int64] representing nanosecond unix timestamps + + """ if isinstance(freq, Tick): stride = freq.nanos if periods is None: @@ -1239,35 +1690,151 @@ def _generate_regular_range(cls, start, end, periods, freq): tz = start.tz elif start is not None: b = Timestamp(start).value - e = b + np.int64(periods) * stride + e = _generate_range_overflow_safe(b, periods, stride, side='start') tz = start.tz elif end is not None: e = Timestamp(end).value + stride - b = e - np.int64(periods) * stride + b = _generate_range_overflow_safe(e, periods, stride, side='end') tz = end.tz else: raise ValueError("at least 'start' or 'end' should be specified " "if a 'period' is given.") - data = np.arange(b, e, stride, dtype=np.int64) - data = cls._simple_new(data.view(_NS_DTYPE), None, tz=tz) + values = np.arange(b, e, stride, dtype=np.int64) + else: tz = None # start and end should have the same timezone by this point - if isinstance(start, Timestamp): + if start is not None: tz = start.tz - elif isinstance(end, Timestamp): + elif end is not None: tz = end.tz xdr = generate_range(start=start, end=end, periods=periods, offset=freq) - values = np.array([x.value for x in xdr]) - data = cls._simple_new(values, freq=freq, tz=tz) + values = np.array([x.value for x in xdr], dtype=np.int64) + data = cls._simple_new(values, freq=freq, tz=tz) return data +def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): + """ + Calculate the second endpoint for passing to np.arange, checking + to avoid an integer overflow. Catch OverflowError and re-raise + as OutOfBoundsDatetime. + + Parameters + ---------- + endpoint : int + periods : int + stride : int + side : {'start', 'end'} + + Returns + ------- + other_end : int + + Raises + ------ + OutOfBoundsDatetime + """ + # GH#14187 raise instead of incorrectly wrapping around + assert side in ['start', 'end'] + if side == 'end': + stride *= -1 + + try: + other_end = checked_add_with_arr(np.int64(endpoint), + np.int64(periods) * stride) + except OverflowError: + raise tslib.OutOfBoundsDatetime('Cannot generate range with ' + '{side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, + periods=periods)) + return other_end + + +# ------------------------------------------------------------------- +# Validation and Inference + +def maybe_infer_tz(tz, inferred_tz): + """ + If a timezone is inferred from data, check that it is compatible with + the user-provided timezone, if any. + + Parameters + ---------- + tz : tzinfo or None + inferred_tz : tzinfo or None + + Returns + ------- + tz : tzinfo or None + + Raises + ------ + TypeError : if both timezones are present but do not match + """ + if tz is None: + tz = inferred_tz + elif inferred_tz is None: + pass + elif not timezones.tz_compare(tz, inferred_tz): + raise TypeError('data is already tz-aware {inferred_tz}, unable to ' + 'set specified tz: {tz}' + .format(inferred_tz=inferred_tz, tz=tz)) + return tz + + +def validate_tz_from_dtype(dtype, tz): + """ + If the given dtype is a DatetimeTZDtype, extract the implied + tzinfo object from it and check that it does not conflict with the given + tz. 
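# Illustration only: the overflow guard sketched by
# _generate_range_overflow_safe above (GH#14187). A range stepping past the
# datetime64[ns] bound should raise OutOfBoundsDatetime rather than wrap.
import pandas as pd

try:
    pd.date_range(start="2262-04-11", periods=10, freq="D")
except pd.errors.OutOfBoundsDatetime as err:
    print(err)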
+ + Parameters + ---------- + dtype : dtype, str + tz : None, tzinfo + + Returns + ------- + tz : consensus tzinfo + + Raises + ------ + ValueError : on tzinfo mismatch + """ + if dtype is not None: + if isinstance(dtype, compat.string_types): + try: + dtype = DatetimeTZDtype.construct_from_string(dtype) + except TypeError: + # Things like `datetime64[ns]`, which is OK for the + # constructors, but also nonsense, which should be validated + # but not by us. We *do* allow non-existent tz errors to + # go through + pass + dtz = getattr(dtype, 'tz', None) + if dtz is not None: + if tz is not None and not timezones.tz_compare(tz, dtz): + raise ValueError("cannot supply both a tz and a dtype" + " with a tz") + tz = dtz + + if tz is not None and is_datetime64_dtype(dtype): + # We also need to check for the case where the user passed a + # tz-naive dtype (i.e. datetime64[ns]) + if tz is not None and not timezones.tz_compare(tz, dtz): + raise ValueError("cannot supply both a tz and a " + "timezone-naive dtype (i.e. datetime64[ns]") + + return tz + + def _infer_tz_from_endpoints(start, end, tz): """ If a timezone is not explicitly given via `tz`, see if one can diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e58109a25e1a5..38dc68e8f77a3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,29 +1,24 @@ +import copy import sys import warnings -import copy + import numpy as np -from pandas._libs.lib import infer_dtype +from pandas._libs import lib +from pandas.compat import range, set_function_name, string_types from pandas.util._decorators import cache_readonly -from pandas.compat import u, range, string_types -from pandas.compat import set_function_name +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe -from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.core.dtypes.common import ( - is_integer, is_scalar, is_float, - is_bool_dtype, - is_float_dtype, - is_integer_dtype, - is_object_dtype, - is_list_like) -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin -from pandas.core.dtypes.base import ExtensionDtype + is_bool_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype, + is_list_like, is_object_dtype, is_scalar) from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna -from pandas.io.formats.printing import ( - format_object_summary, format_object_attrs, default_pprint) +from pandas.core import nanops +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin class _IntegerDtype(ExtensionDtype): @@ -60,6 +55,11 @@ def numpy_dtype(self): def kind(self): return self.numpy_dtype.kind + @cache_readonly + def itemsize(self): + """ Return the number of bytes in this dtype """ + return self.numpy_dtype.itemsize + @classmethod def construct_array_type(cls): """Return the array type associated with this dtype @@ -170,9 +170,12 @@ def coerce_to_array(values, dtype, mask=None, copy=False): values = np.array(values, copy=copy) if is_object_dtype(values): - inferred_type = infer_dtype(values) - if inferred_type not in ['floating', 'integer', - 'mixed-integer', 'mixed-integer-float']: + inferred_type = lib.infer_dtype(values) + if inferred_type is 'mixed' and isna(values).all(): + values = np.empty(len(values)) + values.fill(np.nan) + elif inferred_type not in ['floating', 'integer', + 'mixed-integer', 'mixed-integer-float']: 
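# ---------------------------------------------------------------------------
# Minimal sketch of the tz/dtype reconciliation described above, in plain
# Python plus pytz (already a pandas dependency).  `dtype_tz` stands in for
# the tz extracted from a DatetimeTZDtype; the real helper also parses string
# dtypes and uses timezones.tz_compare rather than string equality.
import pytz

def reconcile_tz(tz, dtype_tz):
    if dtype_tz is not None:
        if tz is not None and str(tz) != str(dtype_tz):
            raise ValueError("cannot supply both a tz and a dtype with a tz")
        tz = dtype_tz
    return tz

print(reconcile_tz(None, pytz.UTC))          # UTC, taken from the dtype
print(reconcile_tz(pytz.UTC, pytz.UTC))      # consistent inputs pass through
try:
    reconcile_tz(pytz.timezone('US/Eastern'), pytz.UTC)
except ValueError as err:
    print(err)                               # conflicting tz and dtype raise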
raise TypeError("{} cannot be converted to an IntegerDtype".format( values.dtype)) @@ -262,6 +265,13 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) + def _formatter(self, boxed=False): + def fmt(x): + if isna(x): + return 'NaN' + return str(x) + return fmt + def __getitem__(self, item): if is_integer(item): if self._mask[item]: @@ -279,6 +289,8 @@ def _coerce_to_ndarray(self): data[self._mask] = self._na_value return data + __array_priority__ = 1000 # higher than ndarray so ops dispatch to us + def __array__(self, dtype=None): """ the array interface, return my values @@ -287,22 +299,12 @@ def __array__(self, dtype=None): return self._coerce_to_ndarray() def __iter__(self): - """Iterate over elements of the array. - - """ - # This needs to be implemented so that pandas recognizes extension - # arrays as list-like. The default implementation makes successive - # calls to ``__getitem__``, which may be slower than necessary. for i in range(len(self)): if self._mask[i]: yield self.dtype.na_value else: yield self._data[i] - def _formatting_values(self): - # type: () -> np.ndarray - return self._coerce_to_ndarray() - def take(self, indexer, allow_fill=False, fill_value=None): from pandas.api.extensions import take @@ -352,25 +354,6 @@ def __setitem__(self, key, value): def __len__(self): return len(self._data) - def __repr__(self): - """ - Return a string representation for this object. - - Invoked by unicode(df) in py2 only. Yields a Unicode String in both - py2/py3. - """ - klass = self.__class__.__name__ - data = format_object_summary(self, default_pprint, False) - attrs = format_object_attrs(self) - space = " " - - prepr = (u(",%s") % - space).join(u("%s=%s") % (k, v) for k, v in attrs) - - res = u("%s(%s%s)") % (klass, data, prepr) - - return res - @property def nbytes(self): return self._data.nbytes + self._mask.nbytes @@ -389,7 +372,8 @@ def _concat_same_type(cls, to_concat): return cls(data, mask) def astype(self, dtype, copy=True): - """Cast to a NumPy array or IntegerArray with 'dtype'. + """ + Cast to a NumPy array or IntegerArray with 'dtype'. Parameters ---------- @@ -503,13 +487,21 @@ def cmp_method(self, other): op_name = op.__name__ mask = None + + if isinstance(other, (ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. 
+ return NotImplemented + if isinstance(other, IntegerArray): other, mask = other._data, other._mask + elif is_list_like(other): other = np.asarray(other) if other.ndim > 0 and len(self) != len(other): raise ValueError('Lengths must match to compare') + other = lib.item_from_zerodim(other) + # numpy will show a DeprecationWarning on invalid elementwise # comparisons, this will raise in the future with warnings.catch_warnings(): @@ -529,6 +521,31 @@ def cmp_method(self, other): name = '__{name}__'.format(name=op.__name__) return set_function_name(cmp_method, name, cls) + def _reduce(self, name, skipna=True, **kwargs): + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + if mask.any(): + data = self._data.astype('float64') + data[mask] = self._na_value + + op = getattr(nanops, 'nan' + name) + result = op(data, axis=0, skipna=skipna, mask=mask) + + # if we have a boolean op, don't coerce + if name in ['any', 'all']: + pass + + # if we have a preservable numeric op, + # provide coercion back to an integer type if possible + elif name in ['sum', 'min', 'max', 'prod'] and notna(result): + int_result = int(result) + if int_result == result: + result = int_result + + return result + def _maybe_mask_result(self, result, mask, other, op_name): """ Parameters @@ -560,14 +577,21 @@ def integer_arithmetic_method(self, other): op_name = op.__name__ mask = None + if isinstance(other, (ABCSeries, ABCIndexClass)): - other = getattr(other, 'values', other) + # Rely on pandas to unbox and dispatch to us. + return NotImplemented - if isinstance(other, IntegerArray): - other, mask = other._data, other._mask - elif getattr(other, 'ndim', 0) > 1: + if getattr(other, 'ndim', 0) > 1: raise NotImplementedError( "can only perform ops with 1-d structures") + + if isinstance(other, IntegerArray): + other, mask = other._data, other._mask + + elif getattr(other, 'ndim', None) == 0: + other = other.item() + elif is_list_like(other): other = np.asarray(other) if not other.ndim: @@ -586,6 +610,13 @@ def integer_arithmetic_method(self, other): else: mask = self._mask | mask + # 1 ** np.nan is 1. So we have to unmask those. 
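# ---------------------------------------------------------------------------
# Quick check of the identities motivating the unmasking below: IEEE pow
# defines 1 ** x == 1 and x ** 0 == 1 even when x is NaN, so those result
# positions are known despite a missing operand.
import numpy as np
print(1.0 ** np.nan)                        # 1.0
print(np.nan ** 0)                          # 1.0
mask = np.array([True, True, True])         # pretend every input was missing
base = np.array([1.0, 2.0, 1.0])
print(np.where(base == 1, False, mask))     # [False  True False]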
+ if op_name == 'pow': + mask = np.where(self == 1, False, mask) + + elif op_name == 'rpow': + mask = np.where(other == 1, False, mask) + with np.errstate(all='ignore'): result = op(self._data, other) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 134999f05364f..785fb02c4d95d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,39 +1,44 @@ +from operator import le, lt import textwrap + import numpy as np -from pandas._libs.interval import (Interval, IntervalMixin, - intervals_to_interval_bounds) +from pandas._libs.interval import ( + Interval, IntervalMixin, intervals_to_interval_bounds) from pandas.compat import add_metaclass from pandas.compat.numpy import function as nv -import pandas.core.common as com -from pandas.core.config import get_option +from pandas.util._decorators import Appender +from pandas.util._doctools import _WritableDoc + from pandas.core.dtypes.cast import maybe_convert_platform -from pandas.core.dtypes.common import (is_categorical_dtype, is_float_dtype, - is_integer_dtype, is_interval_dtype, - is_scalar, is_string_dtype, - is_datetime64_any_dtype, - is_timedelta64_dtype, is_interval, - pandas_dtype) +from pandas.core.dtypes.common import ( + is_categorical_dtype, is_datetime64_any_dtype, is_float_dtype, + is_integer_dtype, is_interval, is_interval_dtype, is_scalar, + is_string_dtype, is_timedelta64_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import IntervalDtype -from pandas.core.dtypes.generic import (ABCDatetimeIndex, ABCPeriodIndex, - ABCSeries, ABCIntervalIndex, - ABCInterval) +from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, ABCInterval, ABCIntervalIndex, ABCPeriodIndex, ABCSeries) from pandas.core.dtypes.missing import isna, notna + +import pandas.core.common as com +from pandas.core.config import get_option from pandas.core.indexes.base import Index, ensure_index -from pandas.util._decorators import Appender -from pandas.util._doctools import _WritableDoc -from . import ExtensionArray, Categorical +from . import Categorical, ExtensionArray _VALID_CLOSED = {'left', 'right', 'both', 'neither'} _interval_shared_docs = {} + +# TODO(jschendel) remove constructor key when IntervalArray is public (GH22860) _shared_docs_kwargs = dict( klass='IntervalArray', + constructor='pd.core.arrays.IntervalArray', name='' ) -_interval_shared_docs['class'] = """%(summary)s +_interval_shared_docs['class'] = """ +%(summary)s .. versionadded:: %(versionadded)s @@ -50,13 +55,15 @@ closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. -%(name)s\ -copy : boolean, default False - Copy the meta-data. dtype : dtype or None, default None - If None, dtype will be inferred + If None, dtype will be inferred. .. versionadded:: 0.23.0 +copy : bool, default False + Copy the input data. +%(name)s\ +verify_integrity : bool, default True + Verify that the %(klass)s is valid. Attributes ---------- @@ -67,6 +74,7 @@ length values is_non_overlapping_monotonic +%(extra_attributes)s\ Methods ------- @@ -86,19 +94,37 @@ See Also -------- -Index : The base pandas Index type -Interval : A bounded slice-like interval; the elements of an IntervalIndex -interval_range : Function to create a fixed frequency IntervalIndex -cut, qcut : Convert arrays of continuous data into Categoricals/Series of - Intervals +Index : The base pandas Index type. +Interval : A bounded slice-like interval; the elements of an %(klass)s. 
+interval_range : Function to create a fixed frequency IntervalIndex. +cut : Bin values into discrete Intervals. +qcut : Bin values into equal-sized Intervals based on rank or sample quantiles. """ +# TODO(jschendel) use a more direct call in Examples when made public (GH22860) @Appender(_interval_shared_docs['class'] % dict( klass="IntervalArray", - summary="Pandas array for interval data that are closed on the same side", + summary="Pandas array for interval data that are closed on the same side.", versionadded="0.24.0", - name='', extra_methods='', examples='', + name='', + extra_attributes='', + extra_methods='', + examples=textwrap.dedent("""\ + Examples + -------- + A new ``IntervalArray`` can be constructed directly from an array-like of + ``Interval`` objects: + + >>> pd.core.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + IntervalArray([(0, 1], (1, 5]], + closed='right', + dtype='interval[int64]') + + It may also be constructed using one of the constructor + methods: :meth:`IntervalArray.from_arrays`, + :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. + """), )) @add_metaclass(_WritableDoc) class IntervalArray(IntervalMixin, ExtensionArray): @@ -219,9 +245,9 @@ def _from_factorized(cls, values, original): See Also -------- - interval_range : Function to create a fixed frequency IntervalIndex - %(klass)s.from_arrays : Construct from a left and right array - %(klass)s.from_tuples : Construct from a sequence of tuples + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct from a left and right array. + %(klass)s.from_tuples : Construct from a sequence of tuples. """ @classmethod @@ -329,13 +355,13 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): See Also -------- - interval_range : Function to create a fixed frequency IntervalIndex + interval_range : Function to create a fixed frequency IntervalIndex. %(klass)s.from_arrays : Construct an %(klass)s from a left and - right array + right array. %(klass)s.from_breaks : Construct an %(klass)s from an array of - splits + splits. %(klass)s.from_tuples : Construct an %(klass)s from an - array-like of tuples + array-like of tuples. """ _interval_shared_docs['from_tuples'] = """ @@ -364,11 +390,11 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): See Also -------- - interval_range : Function to create a fixed frequency IntervalIndex + interval_range : Function to create a fixed frequency IntervalIndex. %(klass)s.from_arrays : Construct an %(klass)s from a left and - right array + right array. %(klass)s.from_breaks : Construct an %(klass)s from an array of - splits + splits. """ @classmethod @@ -664,9 +690,6 @@ def copy(self, deep=False): # TODO: Could skip verify_integrity here. return type(self).from_arrays(left, right, closed=closed) - def _formatting_values(self): - return np.asarray(self) - def isna(self): return isna(self.left) @@ -1002,14 +1025,75 @@ def repeat(self, repeats, **kwargs): See Also -------- - Index.repeat : Equivalent function for Index - Series.repeat : Equivalent function for Series - numpy.repeat : Underlying implementation + Index.repeat : Equivalent function for Index. + Series.repeat : Equivalent function for Series. + numpy.repeat : Underlying implementation. 
""" left_repeat = self.left.repeat(repeats, **kwargs) right_repeat = self.right.repeat(repeats, **kwargs) return self._shallow_copy(left=left_repeat, right=right_repeat) + _interval_shared_docs['overlaps'] = """ + Check elementwise if an Interval overlaps the values in the %(klass)s. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + other : Interval + Interval to check against for an overlap. + + Returns + ------- + ndarray + Boolean array positionally indicating where an overlap occurs. + + Examples + -------- + >>> intervals = %(constructor)s.from_tuples([(0, 1), (1, 3), (2, 4)]) + >>> intervals + %(klass)s([(0, 1], (1, 3], (2, 4]], + closed='right', + dtype='interval[int64]') + >>> intervals.overlaps(pd.Interval(0.5, 1.5)) + array([ True, True, False]) + + Intervals that share closed endpoints overlap: + + >>> intervals.overlaps(pd.Interval(1, 3, closed='left')) + array([ True, True, True]) + + Intervals that only have an open endpoint in common do not overlap: + + >>> intervals.overlaps(pd.Interval(1, 2, closed='right')) + array([False, True, False]) + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + """ + + @Appender(_interval_shared_docs['overlaps'] % _shared_docs_kwargs) + def overlaps(self, other): + if isinstance(other, (IntervalArray, ABCIntervalIndex)): + raise NotImplementedError + elif not isinstance(other, Interval): + msg = '`other` must be Interval-like, got {other}' + raise TypeError(msg.format(other=type(other).__name__)) + + # equality is okay if both endpoints are closed (overlap at a point) + op1 = le if (self.closed_left and other.closed_right) else lt + op2 = le if (other.closed_left and self.closed_right) else lt + + # overlaps is equivalent negation of two interval being disjoint: + # disjoint = (A.left > B.right) or (B.left > A.right) + # (simplifying the negation allows this to be done in less operations) + return op1(self.left, other.right) & op2(other.left, self.right) + def maybe_convert_platform_interval(values): """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 92803ab5f52e0..d9dde1c699761 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -1,38 +1,42 @@ # -*- coding: utf-8 -*- from datetime import timedelta -import warnings +import operator import numpy as np -from pandas._libs import lib -from pandas._libs.tslib import NaT, iNaT -from pandas._libs.tslibs.period import ( - Period, IncompatibleFrequency, DIFFERENT_FREQ_INDEX, - get_period_field_arr, period_asfreq_arr) -from pandas._libs.tslibs import period as libperiod -from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds +from pandas._libs.tslibs import NaT, iNaT, period as libperiod from pandas._libs.tslibs.fields import isleapyear_arr - -from pandas import compat -from pandas.util._decorators import cache_readonly +from pandas._libs.tslibs.period import ( + DIFFERENT_FREQ_INDEX, IncompatibleFrequency, Period, get_period_field_arr, + period_asfreq_arr) +from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds +import pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, cache_readonly +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( - is_integer_dtype, is_float_dtype, is_period_dtype) + _TD_DTYPE, 
ensure_object, is_array_like, is_categorical_dtype, + is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, + is_float_dtype, is_integer_dtype, is_list_like, is_object_dtype, + is_period_dtype, is_string_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import PeriodDtype -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodIndex, ABCSeries +from pandas.core.dtypes.missing import isna, notna +import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray, datetimelike as dtl import pandas.core.common as com +from pandas.core.missing import backfill_1d, pad_1d from pandas.tseries import frequencies -from pandas.tseries.offsets import Tick, DateOffset - -from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.tseries.offsets import Tick def _field_accessor(name, alias, docstring=None): def f(self): base, mult = frequencies.get_freq_code(self.freq) - result = get_period_field_arr(alias, self._ndarray_values, base) + result = get_period_field_arr(alias, self.asi8, base) return result f.__name__ = name @@ -48,19 +52,29 @@ def _period_array_cmp(cls, op): nat_result = True if opname == '__ne__' else False def wrapper(self, other): - op = getattr(self._ndarray_values, opname) + op = getattr(self.asi8, opname) + # We want to eventually defer to the Series or PeriodIndex (which will + # return here with an unboxed PeriodArray). But before we do that, + # we do a bit of validation on type (Period) and freq, so that our + # error messages are sensible + not_implemented = isinstance(other, (ABCSeries, ABCIndexClass)) + if not_implemented: + other = other._values + if isinstance(other, Period): if other.freq != self.freq: msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) result = op(other.ordinal) - elif isinstance(other, PeriodArrayMixin): + elif isinstance(other, cls): if other.freq != self.freq: msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - result = op(other._ndarray_values) + if not_implemented: + return NotImplemented + result = op(other.asi8) mask = self._isnan | other._isnan if mask.any(): @@ -68,7 +82,7 @@ def wrapper(self, other): return result elif other is NaT: - result = np.empty(len(self._ndarray_values), dtype=bool) + result = np.empty(len(self.asi8), dtype=bool) result.fill(nat_result) else: other = Period(other, freq=self.freq) @@ -82,92 +96,139 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -class PeriodArrayMixin(DatetimeLikeArrayMixin): - @property - def _box_func(self): - return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) - - @cache_readonly - def dtype(self): - return PeriodDtype.construct_from_string(self.freq) +class PeriodArray(dtl.DatetimeLikeArrayMixin, ExtensionArray): + """ + Pandas ExtensionArray for storing Period data. + + Users should use :func:`period_array` to create new instances. + + Parameters + ---------- + values : Union[PeriodArray, Series[period], ndarary[int], PeriodIndex] + The data to store. These should be arrays that can be directly + converted to ordinals without inference or copy (PeriodArray, + ndarray[int64]), or a box around such an array (Series[period], + PeriodIndex). + freq : str or DateOffset + The `freq` to use for the array. Mostly applicable when `values` + is an ndarray of integers, when `freq` is required. 
When `values` + is a PeriodArray (or box around), it's checked that ``values.freq`` + matches `freq`. + copy : bool, default False + Whether to copy the ordinals before storing. + + Notes + ----- + There are two components to a PeriodArray + + - ordinals : integer ndarray + - freq : pd.tseries.offsets.Offset + + The values are physically stored as a 1-D ndarray of integers. These are + called "ordinals" and represent some kind of offset from a base. + + The `freq` indicates the span covered by each element of the array. + All elements in the PeriodArray have the same `freq`. + + See Also + -------- + period_array : Create a new PeriodArray. + pandas.PeriodIndex : Immutable Index for period data. + """ + # array priority higher than numpy scalars + __array_priority__ = 1000 + _attributes = ["freq"] + _typ = "periodarray" # ABCPeriodArray + + # Names others delegate to us + _other_ops = [] + _bool_ops = ['is_leap_year'] + _object_ops = ['start_time', 'end_time', 'freq'] + _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second', + 'weekofyear', 'weekday', 'week', 'dayofweek', + 'dayofyear', 'quarter', 'qyear', + 'days_in_month', 'daysinmonth'] + _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _datetimelike_methods = ['strftime', 'to_timestamp', 'asfreq'] - @property - def _ndarray_values(self): - # Ordinals - return self._data + # -------------------------------------------------------------------- + # Constructors - @property - def asi8(self): - return self._ndarray_values.view('i8') + def __init__(self, values, freq=None, dtype=None, copy=False): + freq = validate_dtype_freq(dtype, freq) - @property - def freq(self): - """Return the frequency object if it is set, otherwise None""" - return self._freq - - @freq.setter - def freq(self, value): - msg = ('Setting {cls}.freq has been deprecated and will be ' - 'removed in a future version; use {cls}.asfreq instead. ' - 'The {cls}.freq setter is not guaranteed to work.') - warnings.warn(msg.format(cls=type(self).__name__), - FutureWarning, stacklevel=2) - self._freq = value + if freq is not None: + freq = Period._maybe_convert_freq(freq) - # -------------------------------------------------------------------- - # Constructors + if isinstance(values, ABCSeries): + values = values._values + if not isinstance(values, type(self)): + raise TypeError("Incorrect dtype") - _attributes = ["freq"] + elif isinstance(values, ABCPeriodIndex): + values = values._values - def __new__(cls, values, freq=None, **kwargs): - if is_period_dtype(values): - # PeriodArray, PeriodIndex - if freq is not None and values.freq != freq: - raise IncompatibleFrequency(freq, values.freq) - freq = values.freq - values = values.asi8 + if isinstance(values, type(self)): + if freq is not None and freq != values.freq: + msg = DIFFERENT_FREQ_INDEX.format(values.freq.freqstr, + freq.freqstr) + raise IncompatibleFrequency(msg) + values, freq = values._data, values.freq - return cls._simple_new(values, freq, **kwargs) + values = np.array(values, dtype='int64', copy=copy) + self._data = values + if freq is None: + raise ValueError('freq is not specified and cannot be inferred') + self._dtype = PeriodDtype(freq) @classmethod def _simple_new(cls, values, freq=None, **kwargs): - """ - Values can be any type that can be coerced to Periods. - Ordinals in an ndarray are fastpath-ed to `_from_ordinals` - """ + # TODO(DatetimeArray): remove once all constructors are aligned. 
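# ---------------------------------------------------------------------------
# The "ordinal + freq" storage model described in the class docstring above,
# shown with the public Period scalar: the ordinal is just an integer count
# of freq-sized steps since the 1970 epoch.
import pandas as pd
p = pd.Period('2000-01', freq='M')
print(p.ordinal)                                     # 360 months after 1970-01
print(pd.Period(ordinal=p.ordinal + 1, freq='M'))    # Period('2000-02', 'M')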
+ # alias from PeriodArray.__init__ + return cls(values, freq=freq, **kwargs) - if not is_integer_dtype(values): - values = np.array(values, copy=False) - if len(values) > 0 and is_float_dtype(values): - raise TypeError("{cls} can't take floats" - .format(cls=cls.__name__)) - return cls(values, freq=freq) + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + # type: (Sequence[Optional[Period]], PeriodDtype, bool) -> PeriodArray + if dtype: + freq = dtype.freq + else: + freq = None + periods = np.asarray(scalars, dtype=object) + if copy: + periods = periods.copy() - return cls._from_ordinals(values, freq) + freq = freq or libperiod.extract_freq(periods) + ordinals = libperiod.extract_ordinals(periods, freq) + return cls(ordinals, freq=freq) @classmethod - def _from_ordinals(cls, values, freq=None): - """ - Values should be int ordinals - `__new__` & `_simple_new` cooerce to ordinals and call this method + def _from_datetime64(cls, data, freq, tz=None): """ + Construct a PeriodArray from a datetime64 array - values = np.array(values, dtype='int64', copy=False) + Parameters + ---------- + data : ndarray[datetime64[ns], datetime64[ns, tz]] + freq : str or Tick + tz : tzinfo, optional - result = object.__new__(cls) - result._data = values - if freq is None: - raise ValueError('freq is not specified and cannot be inferred') - result._freq = Period._maybe_convert_freq(freq) - return result + Returns + ------- + PeriodArray[freq] + """ + data, freq = dt64arr_to_periodarr(data, freq, tz) + return cls(data, freq=freq) @classmethod def _generate_range(cls, start, end, periods, freq, fields): + periods = dtl.validate_periods(periods) + if freq is not None: freq = Period._maybe_convert_freq(freq) field_count = len(fields) - if com.count_not_none(start, end) > 0: + if start is not None or end is not None: if field_count > 0: raise ValueError('Can either instantiate from fields ' 'or endpoints, but not both') @@ -180,6 +241,25 @@ def _generate_range(cls, start, end, periods, freq, fields): return subarr, freq + # -------------------------------------------------------------------- + # Data / Attributes + + @cache_readonly + def dtype(self): + return self._dtype + + @property + def _ndarray_values(self): + # Ordinals + return self._data + + @property + def freq(self): + """ + Return the frequency object for this PeriodArray. + """ + return self.dtype.freq + # -------------------------------------------------------------------- # Vectorized analogues of Period properties @@ -204,9 +284,231 @@ def _generate_range(cls, start, end, periods, freq, fields): @property def is_leap_year(self): - """ Logical indicating if the date belongs to a leap year """ + """ + Logical indicating if the date belongs to a leap year + """ return isleapyear_arr(np.asarray(self.year)) + @property + def start_time(self): + return self.to_timestamp(how='start') + + @property + def end_time(self): + return self.to_timestamp(how='end') + + def to_timestamp(self, freq=None, how='start'): + """ + Cast to DatetimeArray/Index. + + Parameters + ---------- + freq : string or DateOffset, optional + Target frequency. 
The default is 'D' for week or longer, + 'S' otherwise + how : {'s', 'e', 'start', 'end'} + + Returns + ------- + DatetimeArray/Index + """ + from pandas.core.arrays import DatetimeArrayMixin + + how = libperiod._validate_end_alias(how) + + end = how == 'E' + if end: + if freq == 'B': + # roll forward to ensure we land on B date + adjust = Timedelta(1, 'D') - Timedelta(1, 'ns') + return self.to_timestamp(how='start') + adjust + else: + adjust = Timedelta(1, 'ns') + return (self + self.freq).to_timestamp(how='start') - adjust + + if freq is None: + base, mult = frequencies.get_freq_code(self.freq) + freq = frequencies.get_to_timestamp_base(base) + else: + freq = Period._maybe_convert_freq(freq) + + base, mult = frequencies.get_freq_code(freq) + new_data = self.asfreq(freq, how=how) + + new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) + return DatetimeArrayMixin(new_data, freq='infer') + + # -------------------------------------------------------------------- + # Array-like / EA-Interface Methods + + def _formatter(self, boxed=False): + if boxed: + return str + return "'{}'".format + + def __setitem__( + self, + key, # type: Union[int, Sequence[int], Sequence[bool], slice] + value # type: Union[NaTType, Period, Sequence[Period]] + ): + # type: (...) -> None + # n.b. the type on `value` is a bit too restrictive. + # we also accept a sequence of stuff coercible to a PeriodArray + # by period_array, which includes things like ndarray[object], + # ndarray[datetime64ns]. I think ndarray[int] / ndarray[str] won't + # work, since the freq can't be inferred. + if is_list_like(value): + is_slice = isinstance(key, slice) + if (not is_slice + and len(key) != len(value) + and not com.is_bool_indexer(key)): + msg = ("shape mismatch: value array of length '{}' does not " + "match indexing result of length '{}'.") + raise ValueError(msg.format(len(key), len(value))) + if not is_slice and len(key) == 0: + return + + value = period_array(value) + + if self.freqstr != value.freqstr: + msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, value.freqstr) + raise IncompatibleFrequency(msg) + + value = value.asi8 + elif isinstance(value, Period): + + if self.freqstr != value.freqstr: + msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, value.freqstr) + raise IncompatibleFrequency(msg) + + value = value.ordinal + elif isna(value): + value = iNaT + else: + msg = ("'value' should be a 'Period', 'NaT', or array of those. " + "Got '{}' instead.".format(type(value).__name__)) + raise TypeError(msg) + self._data[key] = value + + @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) + def _validate_fill_value(self, fill_value): + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, Period): + if fill_value.freq != self.freq: + msg = DIFFERENT_FREQ_INDEX.format(self.freq.freqstr, + fill_value.freqstr) + raise IncompatibleFrequency(msg) + fill_value = fill_value.ordinal + else: + raise ValueError("'fill_value' should be a Period. " + "Got '{got}'.".format(got=fill_value)) + return fill_value + + def fillna(self, value=None, method=None, limit=None): + # TODO(#20300) + # To avoid converting to object, we re-implement here with the changes + # 1. Passing `_data` to func instead of self.astype(object) + # 2. Re-boxing output of 1. + # #20300 should let us do this kind of logic on ExtensionArray.fillna + # and we can use it. 
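# ---------------------------------------------------------------------------
# Effect of the how='end' branch of to_timestamp above (start of the next
# period minus one nanosecond), viewed through the public PeriodIndex; the
# exact reprs are abbreviated in the comments.
import pandas as pd
pi = pd.PeriodIndex(['2000-01'], freq='M')
print(pi.to_timestamp(how='start'))  # DatetimeIndex(['2000-01-01'], ...)
print(pi.to_timestamp(how='end'))    # DatetimeIndex(['2000-01-31 23:59:59.999999999'], ...)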
+ + if isinstance(value, ABCSeries): + value = value._values + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError("Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self))) + value = value[mask] + + if mask.any(): + if method is not None: + func = pad_1d if method == 'pad' else backfill_1d + new_values = func(self._data, limit=limit, + mask=mask) + new_values = type(self)(new_values, freq=self.freq) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + + def value_counts(self, dropna=False): + from pandas import Series, PeriodIndex + + if dropna: + values = self[~self.isna()]._data + else: + values = self._data + + cls = type(self) + + result = algos.value_counts(values, sort=False) + index = PeriodIndex(cls(result.index, freq=self.freq), + name=result.index.name) + return Series(result.values, index=index, name=result.name) + + # -------------------------------------------------------------------- + + def shift(self, periods=1): + """ + Shift values by desired number. + + Newly introduced missing values are filled with + ``self.dtype.na_value``. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + periods : int, default 1 + The number of periods to shift. Negative values are allowed + for shifting backwards. + + Returns + ------- + shifted : PeriodArray + """ + # TODO(DatetimeArray): remove + # The semantics for Index.shift differ from EA.shift + # then just call super. + return ExtensionArray.shift(self, periods) + + def _time_shift(self, n, freq=None): + """ + Shift each value by `periods`. + + Note this is different from ExtensionArray.shift, which + shifts the *position* of each element, padding the end with + missing values. + + Parameters + ---------- + periods : int + Number of periods to shift by. + freq : pandas.DateOffset, pandas.Timedelta, or string + Frequency increment to shift by. + """ + if freq is not None: + raise TypeError("`freq` argument is not supported for " + "{cls}._time_shift" + .format(cls=type(self).__name__)) + values = self.asi8 + n * self.freq.n + if self.hasnans: + values[self._isnan] = iNaT + return type(self)(values, freq=self.freq) + + @property + def _box_func(self): + return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) + def asfreq(self, freq=None, how='E'): """ Convert the Period Array/Index to the specified frequency `freq`. @@ -264,11 +566,98 @@ def asfreq(self, freq=None, how='E'): if self.hasnans: new_data[self._isnan] = iNaT - return self._shallow_copy(new_data, freq=freq) + return type(self)(new_data, freq=freq) # ------------------------------------------------------------------ - # Arithmetic Methods + # Formatting + + def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): + """ + actually format my specific types + """ + # TODO(DatetimeArray): remove + values = self.astype(object) + + if date_format: + formatter = lambda dt: dt.strftime(date_format) + else: + formatter = lambda dt: u'%s' % dt + + if self.hasnans: + mask = self._isnan + values[mask] = na_rep + imask = ~mask + values[imask] = np.array([formatter(dt) for dt + in values[imask]]) + else: + values = np.array([formatter(dt) for dt in values]) + return values + + # Delegation... 
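# ---------------------------------------------------------------------------
# The distinction drawn above between `shift` and `_time_shift`, sketched with
# plain lists of monthly ordinals (illustrative only):
ordinals = [360, 361, 362]                     # 2000-01 .. 2000-03 as 'M' ordinals
time_shifted = [o + 1 for o in ordinals]       # every value moves forward one month
positionally_shifted = [None] + ordinals[:-1]  # positions move; a missing slot appears
print(time_shifted)                            # [361, 362, 363]
print(positionally_shifted)                    # [None, 360, 361]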
+ def strftime(self, date_format): + return self._format_native_types(date_format=date_format) + + def repeat(self, repeats, *args, **kwargs): + """ + Repeat elements of a PeriodArray. + See Also + -------- + numpy.ndarray.repeat + """ + # TODO(DatetimeArray): remove + nv.validate_repeat(args, kwargs) + values = self._data.repeat(repeats) + return type(self)(values, self.freq) + + def astype(self, dtype, copy=True): + # TODO: Figure out something better here... + # We have DatetimeLikeArrayMixin -> + # super(...), which ends up being... DatetimeIndexOpsMixin? + # this is complicated. + # need a pandas_astype(arr, dtype). + from pandas import Categorical + + dtype = pandas_dtype(dtype) + + if is_object_dtype(dtype): + return np.asarray(self, dtype=object) + elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): + return self._format_native_types() + elif is_integer_dtype(dtype): + values = self._data + + if values.dtype != dtype: + # int32 vs. int64 + values = values.astype(dtype) + + elif copy: + values = values.copy() + + return values + elif (is_datetime_or_timedelta_dtype(dtype) and + not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype): + # disallow conversion between datetime/timedelta, + # and conversions for any datetimelike to float + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + elif is_categorical_dtype(dtype): + return Categorical(self, dtype=dtype) + elif is_period_dtype(dtype): + return self.asfreq(dtype.freq) + else: + return np.asarray(self, dtype=dtype) + + @property + def flags(self): + # TODO: remove + # We need this since reduction.SeriesBinGrouper uses values.flags + # Ideally, we wouldn't be passing objects down there in the first + # place. + return self._data.flags + + # ------------------------------------------------------------------ + # Arithmetic Methods _create_comparison_method = classmethod(_period_array_cmp) def _sub_datelike(self, other): @@ -291,110 +680,312 @@ def _sub_period(self, other): return new_data + @Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__) + def _addsub_int_array( + self, + other, # type: Union[Index, ExtensionArray, np.ndarray[int]] + op # type: Callable[Any, Any] + ): + # type: (...) -> PeriodArray + + assert op in [operator.add, operator.sub] + if op is operator.sub: + other = -other + res_values = algos.checked_add_with_arr(self.asi8, other, + arr_mask=self._isnan) + res_values = res_values.view('i8') + res_values[self._isnan] = iNaT + return type(self)(res_values, freq=self.freq) + def _add_offset(self, other): assert not isinstance(other, Tick) base = frequencies.get_base_alias(other.rule_code) if base != self.freq.rule_code: msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - return self._time_shift(other.n) - def _add_delta_td(self, other): + # Note: when calling parent class's _add_timedeltalike_scalar, + # it will call delta_to_nanoseconds(delta). Because delta here + # is an integer, delta_to_nanoseconds will return it unchanged. 
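# ---------------------------------------------------------------------------
# NumPy-only sketch of the masked integer addition used by _addsub_int_array
# above: add the ordinals, then force positions that were NaT back to iNaT
# (iNaT is the int64 minimum, the sentinel pandas stores for NaT).
import numpy as np
iNaT = np.iinfo(np.int64).min
ordinals = np.array([360, iNaT, 362], dtype='int64')
other = np.array([1, 1, 1], dtype='int64')
mask = ordinals == iNaT
result = ordinals + other
result[mask] = iNaT
print(result)   # [361, -9223372036854775808, 363] -- the middle slot stays NaT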
+ result = super(PeriodArray, self)._add_timedeltalike_scalar(other.n) + return type(self)(result, freq=self.freq) + + def _add_timedeltalike_scalar(self, other): + """ + Parameters + ---------- + other : timedelta, Tick, np.timedelta64 + + Returns + ------- + result : ndarray[int64] + """ + assert isinstance(self.freq, Tick) # checked by calling function assert isinstance(other, (timedelta, np.timedelta64, Tick)) - nanos = delta_to_nanoseconds(other) - own_offset = frequencies.to_offset(self.freq.rule_code) - if isinstance(own_offset, Tick): - offset_nanos = delta_to_nanoseconds(own_offset) - if np.all(nanos % offset_nanos == 0): - return self._time_shift(nanos // offset_nanos) + if notna(other): + # special handling for np.timedelta64("NaT"), avoid calling + # _check_timedeltalike_freq_compat as that would raise TypeError + other = self._check_timedeltalike_freq_compat(other) - # raise when input doesn't have freq - raise IncompatibleFrequency("Input has different freq from " - "{cls}(freq={freqstr})" - .format(cls=type(self).__name__, - freqstr=self.freqstr)) + # Note: when calling parent class's _add_timedeltalike_scalar, + # it will call delta_to_nanoseconds(delta). Because delta here + # is an integer, delta_to_nanoseconds will return it unchanged. + ordinals = super(PeriodArray, self)._add_timedeltalike_scalar(other) + return ordinals - def _add_delta(self, other): - ordinal_delta = self._maybe_convert_timedelta(other) - return self._time_shift(ordinal_delta) + def _add_delta_tdi(self, other): + """ + Parameters + ---------- + other : TimedeltaArray or ndarray[timedelta64] + + Returns + ------- + result : ndarray[int64] + """ + assert isinstance(self.freq, Tick) # checked by calling function + + delta = self._check_timedeltalike_freq_compat(other) + return self._addsub_int_array(delta, operator.add).asi8 - def shift(self, n): + def _add_delta(self, other): """ - Specialized shift which produces an Period Array/Index + Add a timedelta-like, Tick, or TimedeltaIndex-like object + to self, yielding a new PeriodArray Parameters ---------- - n : int - Periods to shift by + other : {timedelta, np.timedelta64, Tick, + TimedeltaIndex, ndarray[timedelta64]} Returns ------- - shifted : Period Array/Index + result : PeriodArray """ - return self._time_shift(n) + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise IncompatibleFrequency("Input has different freq from " + "{cls}(freq={freqstr})" + .format(cls=type(self).__name__, + freqstr=self.freqstr)) - def _time_shift(self, n): - values = self._ndarray_values + n * self.freq.n - if self.hasnans: - values[self._isnan] = iNaT - return self._shallow_copy(values=values) + new_ordinals = super(PeriodArray, self)._add_delta(other) + return type(self)(new_ordinals, freq=self.freq) - def _maybe_convert_timedelta(self, other): + def _check_timedeltalike_freq_compat(self, other): """ - Convert timedelta-like input to an integer multiple of self.freq + Arithmetic operations with timedelta-like scalars or array `other` + are only valid if `other` is an integer multiple of `self.freq`. + If the operation is valid, find that integer multiple. Otherwise, + raise because the operation is invalid. 
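# ---------------------------------------------------------------------------
# The compatibility rule spelled out above, in one check: a timedelta-like can
# only be added to a Tick-frequency PeriodArray when it is a whole number of
# the array's own ticks.
import numpy as np
base_nanos = 86400 * 10 ** 9                                   # one 'D' tick, in ns
delta_nanos = int(np.timedelta64(2, 'D') // np.timedelta64(1, 'ns'))
assert delta_nanos % base_nanos == 0
print(delta_nanos // base_nanos)                               # 2 -> shift by two periods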
Parameters ---------- - other : timedelta, np.timedelta64, DateOffset, int, np.ndarray + other : timedelta, np.timedelta64, Tick, + ndarray[timedelta64], TimedeltaArray, TimedeltaIndex Returns ------- - converted : int, np.ndarray[int64] + multiple : int or ndarray[int64] Raises ------ - IncompatibleFrequency : if the input cannot be written as a multiple - of self.freq. Note IncompatibleFrequency subclasses ValueError. + IncompatibleFrequency """ - if isinstance( - other, (timedelta, np.timedelta64, Tick, np.ndarray)): - offset = frequencies.to_offset(self.freq.rule_code) - if isinstance(offset, Tick): - if isinstance(other, np.ndarray): - nanos = np.vectorize(delta_to_nanoseconds)(other) - else: - nanos = delta_to_nanoseconds(other) - offset_nanos = delta_to_nanoseconds(offset) - check = np.all(nanos % offset_nanos == 0) - if check: - return nanos // offset_nanos - elif isinstance(other, DateOffset): - freqstr = other.rule_code - base = frequencies.get_base_alias(freqstr) - if base == self.freq.rule_code: - return other.n - msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) - raise IncompatibleFrequency(msg) - elif lib.is_integer(other): - # integer is passed to .shift via - # _add_datetimelike_methods basically - # but ufunc may pass integer to _add_delta - return other + assert isinstance(self.freq, Tick) # checked by calling function + own_offset = frequencies.to_offset(self.freq.rule_code) + base_nanos = delta_to_nanoseconds(own_offset) + + if isinstance(other, (timedelta, np.timedelta64, Tick)): + nanos = delta_to_nanoseconds(other) + + elif isinstance(other, np.ndarray): + # numpy timedelta64 array; all entries must be compatible + assert other.dtype.kind == 'm' + if other.dtype != _TD_DTYPE: + # i.e. non-nano unit + # TODO: disallow unit-less timedelta64 + other = other.astype(_TD_DTYPE) + nanos = other.view('i8') + else: + # TimedeltaArray/Index + nanos = other.asi8 - # raise when input doesn't have freq - msg = "Input has different freq from {cls}(freq={freqstr})" - raise IncompatibleFrequency(msg.format(cls=type(self).__name__, - freqstr=self.freqstr)) + if np.all(nanos % base_nanos == 0): + # nanos being added is an integer multiple of the + # base-frequency to self.freq + delta = nanos // base_nanos + # delta is the integer (or integer-array) number of periods + # by which will be added to self. + return delta + raise IncompatibleFrequency("Input has different freq from " + "{cls}(freq={freqstr})" + .format(cls=type(self).__name__, + freqstr=self.freqstr)) + + def _values_for_argsort(self): + return self._data -PeriodArrayMixin._add_comparison_ops() -PeriodArrayMixin._add_datetimelike_methods() + +PeriodArray._add_comparison_ops() # ------------------------------------------------------------------- # Constructor Helpers +def period_array(data, freq=None, copy=False): + # type: (Sequence[Optional[Period]], Optional[Tick]) -> PeriodArray + """ + Construct a new PeriodArray from a sequence of Period scalars. + + Parameters + ---------- + data : Sequence of Period objects + A sequence of Period objects. These are required to all have + the same ``freq.`` Missing values can be indicated by ``None`` + or ``pandas.NaT``. + freq : str, Tick, or Offset + The frequency of every element of the array. This can be specified + to avoid inferring the `freq` from `data`. + copy : bool, default False + Whether to ensure a copy of the data is made. 
+ + Returns + ------- + PeriodArray + + See Also + -------- + PeriodArray + pandas.PeriodIndex + + Examples + -------- + >>> period_array([pd.Period('2017', freq='A'), + ... pd.Period('2018', freq='A')]) + + ['2017', '2018'] + Length: 2, dtype: period[A-DEC] + + >>> period_array([pd.Period('2017', freq='A'), + ... pd.Period('2018', freq='A'), + ... pd.NaT]) + + ['2017', '2018', 'NaT'] + Length: 3, dtype: period[A-DEC] + + Integers that look like years are handled + + >>> period_array([2000, 2001, 2002], freq='D') + ['2000-01-01', '2001-01-01', '2002-01-01'] + Length: 3, dtype: period[D] + + Datetime-like strings may also be passed + + >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q') + + ['2000Q1', '2000Q2', '2000Q3', '2000Q4'] + Length: 4, dtype: period[Q-DEC] + """ + if is_datetime64_dtype(data): + return PeriodArray._from_datetime64(data, freq) + if isinstance(data, (ABCPeriodIndex, ABCSeries, PeriodArray)): + return PeriodArray(data, freq) + + # other iterable of some kind + if not isinstance(data, (np.ndarray, list, tuple)): + data = list(data) + + data = np.asarray(data) + + if freq: + dtype = PeriodDtype(freq) + else: + dtype = None + + if is_float_dtype(data) and len(data) > 0: + raise TypeError("PeriodIndex does not allow " + "floating point in construction") + + data = ensure_object(data) + + return PeriodArray._from_sequence(data, dtype=dtype) + + +def validate_dtype_freq(dtype, freq): + """ + If both a dtype and a freq are available, ensure they match. If only + dtype is available, extract the implied freq. + + Parameters + ---------- + dtype : dtype + freq : DateOffset or None + + Returns + ------- + freq : DateOffset + + Raises + ------ + ValueError : non-period dtype + IncompatibleFrequency : mismatch between dtype and freq + """ + if freq is not None: + freq = frequencies.to_offset(freq) + + if dtype is not None: + dtype = pandas_dtype(dtype) + if not is_period_dtype(dtype): + raise ValueError('dtype must be PeriodDtype') + if freq is None: + freq = dtype.freq + elif freq != dtype.freq: + raise IncompatibleFrequency('specified freq and dtype ' + 'are different') + return freq + + +def dt64arr_to_periodarr(data, freq, tz=None): + """ + Convert an datetime-like array to values Period ordinals. + + Parameters + ---------- + data : Union[Series[datetime64[ns]], DatetimeIndex, ndarray[datetime64ns]] + freq : Optional[Union[str, Tick]] + Must match the `freq` on the `data` if `data` is a DatetimeIndex + or Series. + tz : Optional[tzinfo] + + Returns + ------- + ordinals : ndarray[int] + freq : Tick + The frequencey extracted from the Series or DatetimeIndex if that's + used. 
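# ---------------------------------------------------------------------------
# Plain-Python restatement of validate_dtype_freq above: the dtype's freq is
# used when none is given explicitly, and a conflicting pair is rejected (the
# real helper raises IncompatibleFrequency).  Freqs are compared as plain
# strings here purely for illustration.
def reconcile_freq(dtype_freq, freq):
    if dtype_freq is None:
        return freq
    if freq is None or freq == dtype_freq:
        return dtype_freq
    raise ValueError('specified freq and dtype are different')

print(reconcile_freq('M', None))   # 'M', taken from the dtype
print(reconcile_freq(None, 'D'))   # 'D'
try:
    reconcile_freq('M', 'D')
except ValueError as err:
    print(err)                     # mismatch is rejected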
+ + """ + if data.dtype != np.dtype('M8[ns]'): + raise ValueError('Wrong dtype: %s' % data.dtype) + + if freq is None: + if isinstance(data, ABCIndexClass): + data, freq = data._values, data.freq + elif isinstance(data, ABCSeries): + data, freq = data._values, data.dt.freq + + freq = Period._maybe_convert_freq(freq) + + if isinstance(data, (ABCIndexClass, ABCSeries)): + data = data._values + + base, mult = frequencies.get_freq_code(freq) + return libperiod.dt64arr_to_periodarr(data.view('i8'), base, tz), freq + + def _get_ordinal_range(start, end, periods, freq, mult=1): if com.count_not_none(start, end, periods) != 2: raise ValueError('Of the three parameters: start, end, and periods, ' diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py new file mode 100644 index 0000000000000..134466d769ada --- /dev/null +++ b/pandas/core/arrays/sparse.py @@ -0,0 +1,2010 @@ +""" +SparseArray data structure +""" +from __future__ import division + +import numbers +import operator +import re +import warnings + +import numpy as np + +from pandas._libs import index as libindex, lib +import pandas._libs.sparse as splib +from pandas._libs.sparse import BlockIndex, IntIndex +from pandas._libs.tslibs import NaT +import pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.errors import PerformanceWarning + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import ( + astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type, + infer_dtype_from_scalar, maybe_convert_platform) +from pandas.core.dtypes.common import ( + is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, + is_integer, is_list_like, is_object_dtype, is_scalar, is_string_dtype, + pandas_dtype) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ( + ABCIndexClass, ABCSeries, ABCSparseSeries) +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna + +from pandas.core.accessor import PandasDelegate, delegate_names +import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.missing import interpolate_2d + +import pandas.io.formats.printing as printing + + +# ---------------------------------------------------------------------------- +# Dtype +@register_extension_dtype +class SparseDtype(ExtensionDtype): + """ + Dtype for data stored in :class:`SparseArray`. + + This dtype implements the pandas ExtensionDtype interface. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 + The dtype of the underlying array storing the non-fill value values. + fill_value : scalar, optional. + The scalar value not stored in the SparseArray. By default, this + depends on `dtype`. + + ========== ========== + dtype na_value + ========== ========== + float ``np.nan`` + int ``0`` + bool ``False`` + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + ========== ========== + + The default value may be overridden by specifying a `fill_value`. + """ + # We include `_is_na_fill_value` in the metadata to avoid hash collisions + # between SparseDtype(float, 0.0) and SparseDtype(float, nan). + # Without is_na_fill_value in the comparison, those would be equal since + # hash(nan) is (sometimes?) 0. 
+ _metadata = ('_dtype', '_fill_value', '_is_na_fill_value') + + def __init__(self, dtype=np.float64, fill_value=None): + # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None + from pandas.core.dtypes.missing import na_value_for_dtype + from pandas.core.dtypes.common import ( + pandas_dtype, is_string_dtype, is_scalar + ) + + if isinstance(dtype, type(self)): + if fill_value is None: + fill_value = dtype.fill_value + dtype = dtype.subtype + + dtype = pandas_dtype(dtype) + if is_string_dtype(dtype): + dtype = np.dtype('object') + + if fill_value is None: + fill_value = na_value_for_dtype(dtype) + + if not is_scalar(fill_value): + raise ValueError("fill_value must be a scalar. Got {} " + "instead".format(fill_value)) + self._dtype = dtype + self._fill_value = fill_value + + def __hash__(self): + # Python3 doesn't inherit __hash__ when a base class overrides + # __eq__, so we explicitly do it here. + return super(SparseDtype, self).__hash__() + + def __eq__(self, other): + # We have to override __eq__ to handle NA values in _metadata. + # The base class does simple == checks, which fail for NA. + if isinstance(other, compat.string_types): + try: + other = self.construct_from_string(other) + except TypeError: + return False + + if isinstance(other, type(self)): + subtype = self.subtype == other.subtype + if self._is_na_fill_value: + # this case is complicated by two things: + # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) + # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) + # i.e. we want to treat any floating-point NaN as equal, but + # not a floating-point NaN and a datetime NaT. + fill_value = ( + other._is_na_fill_value and + isinstance(self.fill_value, type(other.fill_value)) or + isinstance(other.fill_value, type(self.fill_value)) + ) + else: + fill_value = self.fill_value == other.fill_value + + return subtype and fill_value + return False + + @property + def fill_value(self): + """ + The fill value of the array. + + Converting the SparseArray to a dense ndarray will fill the + array with this value. + + .. warning:: + + It's possible to end up with a SparseArray that has ``fill_value`` + values in ``sp_values``. This can occur, for example, when setting + ``SparseArray.fill_value`` directly. + """ + return self._fill_value + + @property + def _is_na_fill_value(self): + from pandas.core.dtypes.missing import isna + return isna(self.fill_value) + + @property + def _is_numeric(self): + from pandas.core.dtypes.common import is_object_dtype + return not is_object_dtype(self.subtype) + + @property + def _is_boolean(self): + from pandas.core.dtypes.common import is_bool_dtype + return is_bool_dtype(self.subtype) + + @property + def kind(self): + """ + The sparse kind. Either 'integer', or 'block'. + """ + return self.subtype.kind + + @property + def type(self): + return self.subtype.type + + @property + def subtype(self): + return self._dtype + + @property + def name(self): + return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value) + + def __repr__(self): + return self.name + + @classmethod + def construct_array_type(cls): + return SparseArray + + @classmethod + def construct_from_string(cls, string): + """ + Construct a SparseDtype from a string form. + + Parameters + ---------- + string : str + Can take the following forms. 
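# ---------------------------------------------------------------------------
# Why __eq__ above needs the NA special case: a naive metadata comparison
# would treat two NaN fill values as different, because nan != nan.
import numpy as np
a_fill = np.nan
b_fill = float('nan')
print(a_fill == b_fill)                                    # False
same_na = (np.isnan(a_fill) and np.isnan(b_fill)
           and (isinstance(a_fill, type(b_fill))
                or isinstance(b_fill, type(a_fill))))
print(same_na)                                             # True: both are float NaN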
+ + string dtype + ================ ============================ + 'int' SparseDtype[np.int64, 0] + 'Sparse' SparseDtype[np.float64, nan] + 'Sparse[int]' SparseDtype[np.int64, 0] + 'Sparse[int, 0]' SparseDtype[np.int64, 0] + ================ ============================ + + It is not possible to specify non-default fill values + with a string. An argument like ``'Sparse[int, 1]'`` + will raise a ``TypeError`` because the default fill value + for integers is 0. + + Returns + ------- + SparseDtype + """ + msg = "Could not construct SparseDtype from '{}'".format(string) + if string.startswith("Sparse"): + try: + sub_type, has_fill_value = cls._parse_subtype(string) + result = SparseDtype(sub_type) + except Exception: + raise TypeError(msg) + else: + msg = ("Could not construct SparseDtype from '{}'.\n\nIt " + "looks like the fill_value in the string is not " + "the default for the dtype. Non-default fill_values " + "are not supported. Use the 'SparseDtype()' " + "constructor instead.") + if has_fill_value and str(result) != string: + raise TypeError(msg.format(string)) + return result + else: + raise TypeError(msg) + + @staticmethod + def _parse_subtype(dtype): + """ + Parse a string to get the subtype + + Parameters + ---------- + dtype : str + A string like + + * Sparse[subtype] + * Sparse[subtype, fill_value] + + Returns + ------- + subtype : str + + Raises + ------ + ValueError + When the subtype cannot be extracted. + """ + xpr = re.compile( + r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$" + ) + m = xpr.match(dtype) + has_fill_value = False + if m: + subtype = m.groupdict()['subtype'] + has_fill_value = m.groupdict()['fill_value'] or has_fill_value + elif dtype == "Sparse": + subtype = 'float64' + else: + raise ValueError("Cannot parse {}".format(dtype)) + return subtype, has_fill_value + + @classmethod + def is_dtype(cls, dtype): + dtype = getattr(dtype, 'dtype', dtype) + if (isinstance(dtype, compat.string_types) and + dtype.startswith("Sparse")): + sub_type, _ = cls._parse_subtype(dtype) + dtype = np.dtype(sub_type) + elif isinstance(dtype, cls): + return True + return isinstance(dtype, np.dtype) or dtype == 'Sparse' + + def update_dtype(self, dtype): + """ + Convert the SparseDtype to a new dtype. + + This takes care of converting the ``fill_value``. + + Parameters + ---------- + dtype : Union[str, numpy.dtype, SparseDtype] + The new dtype to use. + + * For a SparseDtype, it is simply returned + * For a NumPy dtype (or str), the current fill value + is converted to the new dtype, and a SparseDtype + with `dtype` and the new fill value is returned. + + Returns + ------- + SparseDtype + A new SparseDtype with the corret `dtype` and fill value + for that `dtype`. + + Raises + ------ + ValueError + When the current fill value cannot be converted to the + new `dtype` (e.g. trying to convert ``np.nan`` to an + integer dtype). + + + Examples + -------- + >>> SparseDtype(int, 0).update_dtype(float) + Sparse[float64, 0.0] + + >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) + Sparse[float64, nan] + """ + cls = type(self) + dtype = pandas_dtype(dtype) + + if not isinstance(dtype, cls): + fill_value = astype_nansafe(np.array(self.fill_value), + dtype).item() + dtype = cls(dtype, fill_value=fill_value) + + return dtype + + @property + def _subtype_with_str(self): + """ + Whether the SparseDtype's subtype should be considered ``str``. + + Typically, pandas will store string data in an object-dtype array. + When converting values to a dtype, e.g. 
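# ---------------------------------------------------------------------------
# The pattern used by _parse_subtype above, written out with its named groups
# so the accepted forms from the table are easy to verify by hand:
import re
xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
for text in ("Sparse[int]", "Sparse[int, 0]", "Sparse[datetime64[ns], NaT]"):
    match = xpr.match(text)
    print(text, '->', match.group('subtype'), repr(match.group('fill_value')))
# Sparse[int] -> int ''
# Sparse[int, 0] -> int '0'
# Sparse[datetime64[ns], NaT] -> datetime64[ns] 'NaT'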
in ``.astype``, we need to + be more specific, we need the actual underlying type. + + Returns + ------- + + >>> SparseDtype(int, 1)._subtype_with_str + dtype('int64') + + >>> SparseDtype(object, 1)._subtype_with_str + dtype('O') + + >>> dtype = SparseDtype(str, '') + >>> dtype.subtype + dtype('O') + + >>> dtype._subtype_with_str + str + """ + if isinstance(self.fill_value, compat.string_types): + return type(self.fill_value) + return self.subtype + + +# ---------------------------------------------------------------------------- +# Array + + +_sparray_doc_kwargs = dict(klass='SparseArray') + + +def _get_fill(arr): + # type: (SparseArray) -> ndarray + """ + Create a 0-dim ndarray containing the fill value + + Parameters + ---------- + arr : SparseArray + + Returns + ------- + fill_value : ndarray + 0-dim ndarray with just the fill value. + + Notes + ----- + coerce fill_value to arr dtype if possible + int64 SparseArray can have NaN as fill_value if there is no missing + """ + try: + return np.asarray(arr.fill_value, dtype=arr.dtype.subtype) + except ValueError: + return np.asarray(arr.fill_value) + + +def _sparse_array_op(left, right, op, name): + """ + Perform a binary operation between two arrays. + + Parameters + ---------- + left : Union[SparseArray, ndarray] + right : Union[SparseArray, ndarray] + op : Callable + The binary operation to perform + name str + Name of the callable. + + Returns + ------- + SparseArray + """ + # type: (SparseArray, SparseArray, Callable, str) -> Any + if name.startswith('__'): + # For lookups in _libs.sparse we need non-dunder op name + name = name[2:-2] + + # dtype used to find corresponding sparse method + ltype = left.dtype.subtype + rtype = right.dtype.subtype + + if not is_dtype_equal(ltype, rtype): + subtype = find_common_type([ltype, rtype]) + ltype = SparseDtype(subtype, left.fill_value) + rtype = SparseDtype(subtype, right.fill_value) + + # TODO(GH-23092): pass copy=False. 
Need to fix astype_nansafe + left = left.astype(ltype) + right = right.astype(rtype) + dtype = ltype.subtype + else: + dtype = ltype + + # dtype the result must have + result_dtype = None + + if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: + with np.errstate(all='ignore'): + result = op(left.get_values(), right.get_values()) + fill = op(_get_fill(left), _get_fill(right)) + + if left.sp_index.ngaps == 0: + index = left.sp_index + else: + index = right.sp_index + elif left.sp_index.equals(right.sp_index): + with np.errstate(all='ignore'): + result = op(left.sp_values, right.sp_values) + fill = op(_get_fill(left), _get_fill(right)) + index = left.sp_index + else: + if name[0] == 'r': + left, right = right, left + name = name[1:] + + if name in ('and', 'or') and dtype == 'bool': + opname = 'sparse_{name}_uint8'.format(name=name) + # to make template simple, cast here + left_sp_values = left.sp_values.view(np.uint8) + right_sp_values = right.sp_values.view(np.uint8) + result_dtype = np.bool + else: + opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) + left_sp_values = left.sp_values + right_sp_values = right.sp_values + + sparse_op = getattr(splib, opname) + + with np.errstate(all='ignore'): + result, index, fill = sparse_op( + left_sp_values, left.sp_index, left.fill_value, + right_sp_values, right.sp_index, right.fill_value) + + if result_dtype is None: + result_dtype = result.dtype + + return _wrap_result(name, result, index, fill, dtype=result_dtype) + + +def _wrap_result(name, data, sparse_index, fill_value, dtype=None): + """ + wrap op result to have correct dtype + """ + if name.startswith('__'): + # e.g. __eq__ --> eq + name = name[2:-2] + + if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): + dtype = np.bool + + fill_value = lib.item_from_zerodim(fill_value) + + if is_bool_dtype(dtype): + # fill_value may be np.bool_ + fill_value = bool(fill_value) + return SparseArray(data, + sparse_index=sparse_index, + fill_value=fill_value, + dtype=dtype) + + +class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): + """ + An ExtensionArray for storing sparse data. + + .. versionchanged:: 0.24.0 + + Implements the ExtensionArray interface. + + Parameters + ---------- + data : array-like + A dense array of values to store in the SparseArray. This may contain + `fill_value`. + sparse_index : SparseIndex, optional + index : Index + fill_value : scalar, optional + Elements in `data` that are `fill_value` are not stored in the + SparseArray. For memory savings, this should be the most common value + in `data`. By default, `fill_value` depends on the dtype of `data`: + + =========== ========== + data.dtype na_value + =========== ========== + float ``np.nan`` + int ``0`` + bool False + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + =========== ========== + + The fill value is potentiall specified in three ways. In order of + precedence, these are + + 1. The `fill_value` argument + 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is + a ``SparseDtype`` + 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` + is not a ``SparseDtype`` and `data` is a ``SparseArray``. + + + kind : {'integer', 'block'}, default 'integer' + The type of storage for sparse locations. + + * 'block': Stores a `block` and `block_length` for each + contiguous *span* of sparse values. This is best when + sparse data tends to be clumped together, with large + regsions of ``fill-value`` values between sparse values. 
+ * 'integer': uses an integer to store the location of + each sparse value. + + dtype : np.dtype or SparseDtype, optional + The dtype to use for the SparseArray. For numpy dtypes, this + determines the dtype of ``self.sp_values``. For SparseDtype, + this determines ``self.sp_values`` and ``self.fill_value``. + copy : bool, default False + Whether to explicitly copy the incoming `data` array. + """ + + __array_priority__ = 15 + _pandas_ftype = 'sparse' + _subtyp = 'sparse_array' # register ABCSparseArray + + def __init__(self, data, sparse_index=None, index=None, fill_value=None, + kind='integer', dtype=None, copy=False): + from pandas.core.internals import SingleBlockManager + + if isinstance(data, SingleBlockManager): + data = data.internal_values() + + if fill_value is None and isinstance(dtype, SparseDtype): + fill_value = dtype.fill_value + + if isinstance(data, (type(self), ABCSparseSeries)): + # disable normal inference on dtype, sparse_index, & fill_value + if sparse_index is None: + sparse_index = data.sp_index + if fill_value is None: + fill_value = data.fill_value + if dtype is None: + dtype = data.dtype + # TODO: make kind=None, and use data.kind? + data = data.sp_values + + # Handle use-provided dtype + if isinstance(dtype, compat.string_types): + # Two options: dtype='int', regular numpy dtype + # or dtype='Sparse[int]', a sparse dtype + try: + dtype = SparseDtype.construct_from_string(dtype) + except TypeError: + dtype = pandas_dtype(dtype) + + if isinstance(dtype, SparseDtype): + if fill_value is None: + fill_value = dtype.fill_value + dtype = dtype.subtype + + if index is not None and not is_scalar(data): + raise Exception("must only pass scalars with an index ") + + if is_scalar(data): + if index is not None: + if data is None: + data = np.nan + + if index is not None: + npoints = len(index) + elif sparse_index is None: + npoints = 1 + else: + npoints = sparse_index.length + + dtype = infer_dtype_from_scalar(data)[0] + data = construct_1d_arraylike_from_scalar( + data, npoints, dtype + ) + + if dtype is not None: + dtype = pandas_dtype(dtype) + + # TODO: disentangle the fill_value dtype inference from + # dtype inference + if data is None: + # XXX: What should the empty dtype be? Object or float? + data = np.array([], dtype=dtype) + + if not is_array_like(data): + try: + # probably shared code in sanitize_series + from pandas.core.internals.construction import sanitize_array + data = sanitize_array(data, index=None) + except ValueError: + # NumPy may raise a ValueError on data like [1, []] + # we retry with object dtype here. + if dtype is None: + dtype = object + data = np.atleast_1d(np.asarray(data, dtype=dtype)) + else: + raise + + if copy: + # TODO: avoid double copy when dtype forces cast. 
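+            # (A second copy can still happen further down when ``dtype``
+            # forces a cast, e.g. inside ``make_sparse`` / ``np.asarray``,
+            # which is what the TODO above refers to.)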
+ data = data.copy() + + if fill_value is None: + fill_value_dtype = data.dtype if dtype is None else dtype + if fill_value_dtype is None: + fill_value = np.nan + else: + fill_value = na_value_for_dtype(fill_value_dtype) + + if isinstance(data, type(self)) and sparse_index is None: + sparse_index = data._sparse_index + sparse_values = np.asarray(data.sp_values, dtype=dtype) + elif sparse_index is None: + sparse_values, sparse_index, fill_value = make_sparse( + data, kind=kind, fill_value=fill_value, dtype=dtype + ) + else: + sparse_values = np.asarray(data, dtype=dtype) + if len(sparse_values) != sparse_index.npoints: + raise AssertionError("Non array-like type {type} must " + "have the same length as the index" + .format(type=type(sparse_values))) + self._sparse_index = sparse_index + self._sparse_values = sparse_values + self._dtype = SparseDtype(sparse_values.dtype, fill_value) + + @classmethod + def _simple_new(cls, sparse_array, sparse_index, dtype): + # type: (np.ndarray, SparseIndex, SparseDtype) -> 'SparseArray' + new = cls([]) + new._sparse_index = sparse_index + new._sparse_values = sparse_array + new._dtype = dtype + return new + + def __array__(self, dtype=None, copy=True): + fill_value = self.fill_value + + if self.sp_index.ngaps == 0: + # Compat for na dtype and int values. + return self.sp_values + if dtype is None: + # Can NumPy represent this type? + # If not, `np.result_type` will raise. We catch that + # and return object. + if is_datetime64_any_dtype(self.sp_values.dtype): + # However, we *do* special-case the common case of + # a datetime64 with pandas NaT. + if fill_value is NaT: + # Can't put pd.NaT in a datetime64[ns] + fill_value = np.datetime64('NaT') + try: + dtype = np.result_type(self.sp_values.dtype, type(fill_value)) + except TypeError: + dtype = object + + out = np.full(self.shape, fill_value, dtype=dtype) + out[self.sp_index.to_int_index().indices] = self.sp_values + return out + + def __setitem__(self, key, value): + # I suppose we could allow setting of non-fill_value elements. + msg = "SparseArray does not support item assignment via setitem" + raise TypeError(msg) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(scalars, dtype=dtype) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values, dtype=original.dtype) + + # ------------------------------------------------------------------------ + # Data + # ------------------------------------------------------------------------ + @property + def sp_index(self): + """ + The SparseIndex containing the location of non- ``fill_value`` points. + """ + return self._sparse_index + + @property + def sp_values(self): + """ + An ndarray containing the non- ``fill_value`` values. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0) + >>> s.sp_values + array([1, 2]) + """ + return self._sparse_values + + @property + def dtype(self): + return self._dtype + + @property + def fill_value(self): + """ + Elements in `data` that are `fill_value` are not stored. + + For memory savings, this should be the most common value in the array. + """ + return self.dtype.fill_value + + @fill_value.setter + def fill_value(self, value): + self._dtype = SparseDtype(self.dtype.subtype, value) + + @property + def kind(self): + """ + The kind of sparse index for this array. One of {'integer', 'block'}. 
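+
+        Examples
+        --------
+        >>> SparseArray([0, 0, 1], kind='integer').kind
+        'integer'
+        >>> SparseArray([0, 0, 1], kind='block').kind
+        'block'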
+ """ + if isinstance(self.sp_index, IntIndex): + return 'integer' + else: + return 'block' + + @property + def _valid_sp_values(self): + sp_vals = self.sp_values + mask = notna(sp_vals) + return sp_vals[mask] + + def __len__(self): + return self.sp_index.length + + @property + def _null_fill_value(self): + return self._dtype._is_na_fill_value + + def _fill_value_matches(self, fill_value): + if self._null_fill_value: + return isna(fill_value) + else: + return self.fill_value == fill_value + + @property + def nbytes(self): + return self.sp_values.nbytes + self.sp_index.nbytes + + @property + def density(self): + """ + The percent of non- ``fill_value`` points, as decimal. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.density + 0.6 + """ + r = float(self.sp_index.npoints) / float(self.sp_index.length) + return r + + @property + def npoints(self): + """ + The number of non- ``fill_value`` points. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.npoints + 3 + """ + return self.sp_index.npoints + + @property + def values(self): + """ + Dense values + """ + return self.to_dense() + + def isna(self): + from pandas import isna + # If null fill value, we want SparseDtype[bool, true] + # to preserve the same memory usage. + dtype = SparseDtype(bool, self._null_fill_value) + return type(self)._simple_new(isna(self.sp_values), + self.sp_index, dtype) + + def fillna(self, value=None, method=None, limit=None): + """ + Fill missing values with `value`. + + Parameters + ---------- + value : scalar, optional + method : str, optional + + .. warning:: + + Using 'method' will result in high memory use, + as all `fill_value` methods will be converted to + an in-memory ndarray + + limit : int, optional + + Returns + ------- + SparseArray + + Notes + ----- + When `value` is specified, the result's ``fill_value`` depends on + ``self.fill_value``. The goal is to maintain low-memory use. + + If ``self.fill_value`` is NA, the result dtype will be + ``SparseDtype(self.dtype, fill_value=value)``. This will preserve + amount of memory used before and after filling. + + When ``self.fill_value`` is not NA, the result dtype will be + ``self.dtype``. Again, this preserves the amount of memory used. + """ + if ((method is None and value is None) or + (method is not None and value is not None)): + raise ValueError("Must specify one of 'method' or 'value'.") + + elif method is not None: + msg = "fillna with 'method' requires high memory usage." + warnings.warn(msg, PerformanceWarning) + filled = interpolate_2d(np.asarray(self), method=method, + limit=limit) + return type(self)(filled, fill_value=self.fill_value) + + else: + new_values = np.where(isna(self.sp_values), value, self.sp_values) + + if self._null_fill_value: + # This is essentially just updating the dtype. 
+ new_dtype = SparseDtype(self.dtype.subtype, fill_value=value) + else: + new_dtype = self.dtype + + return self._simple_new(new_values, self._sparse_index, new_dtype) + + def shift(self, periods=1): + + if periods == 0: + return self.copy() + + subtype = np.result_type(np.nan, self.dtype.subtype) + + if subtype != self.dtype.subtype: + # just coerce up front + arr = self.astype(SparseDtype(subtype, self.fill_value)) + else: + arr = self + + empty = self._from_sequence([self.dtype.na_value] * abs(periods), + dtype=arr.dtype) + if periods > 0: + a = empty + b = arr[:-periods] + else: + a = arr[abs(periods):] + b = empty + return arr._concat_same_type([a, b]) + + def _first_fill_value_loc(self): + """ + Get the location of the first missing value. + + Returns + ------- + int + """ + if len(self) == 0 or self.sp_index.npoints == len(self): + return -1 + + indices = self.sp_index.to_int_index().indices + if not len(indices) or indices[0] > 0: + return 0 + + diff = indices[1:] - indices[:-1] + return np.searchsorted(diff, 2) + 1 + + def unique(self): + uniques = list(algos.unique(self.sp_values)) + fill_loc = self._first_fill_value_loc() + if fill_loc >= 0: + uniques.insert(fill_loc, self.fill_value) + return type(self)._from_sequence(uniques, dtype=self.dtype) + + def _values_for_factorize(self): + # Still override this for hash_pandas_object + return np.asarray(self), self.fill_value + + def factorize(self, na_sentinel=-1): + # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] + # The sparsity on this is backwards from what Sparse would want. Want + # ExtensionArray.factorize -> Tuple[EA, EA] + # Given that we have to return a dense array of labels, why bother + # implementing an efficient factorize? + labels, uniques = algos.factorize(np.asarray(self), + na_sentinel=na_sentinel) + uniques = SparseArray(uniques, dtype=self.dtype) + return labels, uniques + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of unique values. + + Parameters + ---------- + dropna : boolean, default True + Don't include counts of NaN, even if NaN is in sp_values. + + Returns + ------- + counts : Series + """ + from pandas import Index, Series + + keys, counts = algos._value_counts_arraylike(self.sp_values, + dropna=dropna) + fcounts = self.sp_index.ngaps + if fcounts > 0: + if self._null_fill_value and dropna: + pass + else: + if self._null_fill_value: + mask = isna(keys) + else: + mask = keys == self.fill_value + + if mask.any(): + counts[mask] += fcounts + else: + keys = np.insert(keys, 0, self.fill_value) + counts = np.insert(counts, 0, fcounts) + + if not isinstance(keys, ABCIndexClass): + keys = Index(keys) + result = Series(counts, index=keys) + return result + + # -------- + # Indexing + # -------- + + def __getitem__(self, key): + if isinstance(key, tuple): + if len(key) > 1: + raise IndexError("too many indices for array.") + key = key[0] + + if is_integer(key): + return self._get_val_at(key) + elif isinstance(key, tuple): + data_slice = self.values[key] + elif isinstance(key, slice): + # special case to preserve dtypes + if key == slice(None): + return self.copy() + # TODO: this logic is surely elsewhere + # TODO: this could be more efficient + indices = np.arange(len(self), dtype=np.int32)[key] + return self.take(indices) + else: + # TODO: I think we can avoid densifying when masking a + # boolean SparseArray with another. Need to look at the + # key's fill_value for True / False, and then do an intersection + # on the indicies of the sp_values. 
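+            # e.g. (illustrative): masking with a boolean SparseArray,
+            #   >>> arr = SparseArray([0, 0, 1, 2], fill_value=0)
+            #   >>> arr[arr > 0]
+            # currently densifies the mask and dispatches to ``take`` below.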
+ if isinstance(key, SparseArray): + if is_bool_dtype(key): + key = key.to_dense() + else: + key = np.asarray(key) + + if com.is_bool_indexer(key) and len(self) == len(key): + return self.take(np.arange(len(key), dtype=np.int32)[key]) + elif hasattr(key, '__len__'): + return self.take(key) + else: + raise ValueError("Cannot slice with '{}'".format(key)) + + return type(self)(data_slice, kind=self.kind) + + def _get_val_at(self, loc): + n = len(self) + if loc < 0: + loc += n + + if loc >= n or loc < 0: + raise IndexError('Out of bounds access') + + sp_loc = self.sp_index.lookup(loc) + if sp_loc == -1: + return self.fill_value + else: + return libindex.get_value_at(self.sp_values, sp_loc) + + def take(self, indices, allow_fill=False, fill_value=None): + if is_scalar(indices): + raise ValueError("'indices' must be an array, not a " + "scalar '{}'.".format(indices)) + indices = np.asarray(indices, dtype=np.int32) + + if indices.size == 0: + result = [] + kwargs = {'dtype': self.dtype} + elif allow_fill: + result = self._take_with_fill(indices, fill_value=fill_value) + kwargs = {} + else: + result = self._take_without_fill(indices) + kwargs = {'dtype': self.dtype} + + return type(self)(result, fill_value=self.fill_value, kind=self.kind, + **kwargs) + + def _take_with_fill(self, indices, fill_value=None): + if fill_value is None: + fill_value = self.dtype.na_value + + if indices.min() < -1: + raise ValueError("Invalid value in 'indices'. Must be between -1 " + "and the length of the array.") + + if indices.max() >= len(self): + raise IndexError("out of bounds value in 'indices'.") + + if len(self) == 0: + # Empty... Allow taking only if all empty + if (indices == -1).all(): + dtype = np.result_type(self.sp_values, type(fill_value)) + taken = np.empty_like(indices, dtype=dtype) + taken.fill(fill_value) + return taken + else: + raise IndexError('cannot do a non-empty take from an empty ' + 'axes.') + + sp_indexer = self.sp_index.lookup_array(indices) + + if self.sp_index.npoints == 0: + # Avoid taking from the empty self.sp_values + taken = np.full(sp_indexer.shape, fill_value=fill_value, + dtype=np.result_type(type(fill_value))) + else: + taken = self.sp_values.take(sp_indexer) + + # sp_indexer may be -1 for two reasons + # 1.) we took for an index of -1 (new) + # 2.) we took a value that was self.fill_value (old) + new_fill_indices = indices == -1 + old_fill_indices = (sp_indexer == -1) & ~new_fill_indices + + # Fill in two steps. + # Old fill values + # New fill values + # potentially coercing to a new dtype at each stage. + + m0 = sp_indexer[old_fill_indices] < 0 + m1 = sp_indexer[new_fill_indices] < 0 + + result_type = taken.dtype + + if m0.any(): + result_type = np.result_type(result_type, + type(self.fill_value)) + taken = taken.astype(result_type) + taken[old_fill_indices] = self.fill_value + + if m1.any(): + result_type = np.result_type(result_type, type(fill_value)) + taken = taken.astype(result_type) + taken[new_fill_indices] = fill_value + + return taken + + def _take_without_fill(self, indices): + to_shift = indices < 0 + indices = indices.copy() + + n = len(self) + + if (indices.max() >= n) or (indices.min() < -n): + if n == 0: + raise IndexError("cannot do a non-empty take from an " + "empty axes.") + else: + raise IndexError("out of bounds value in 'indices'.") + + if to_shift.any(): + indices[to_shift] += n + + if self.sp_index.npoints == 0: + # edge case in take... 
+ # I think just return + out = np.full(indices.shape, self.fill_value, + dtype=np.result_type(type(self.fill_value))) + arr, sp_index, fill_value = make_sparse(out, + fill_value=self.fill_value) + return type(self)(arr, sparse_index=sp_index, + fill_value=fill_value) + + sp_indexer = self.sp_index.lookup_array(indices) + taken = self.sp_values.take(sp_indexer) + fillable = (sp_indexer < 0) + + if fillable.any(): + # TODO: may need to coerce array to fill value + result_type = np.result_type(taken, type(self.fill_value)) + taken = taken.astype(result_type) + taken[fillable] = self.fill_value + + return taken + + def copy(self, deep=False): + if deep: + values = self.sp_values.copy() + else: + values = self.sp_values + + return self._simple_new(values, self.sp_index, self.dtype) + + @classmethod + def _concat_same_type(cls, to_concat): + fill_values = [x.fill_value for x in to_concat] + + fill_value = fill_values[0] + + # np.nan isn't a singleton, so we may end up with multiple + # NaNs here, so we ignore tha all NA case too. + if not (len(set(fill_values)) == 1 or isna(fill_values).all()): + warnings.warn("Concatenating sparse arrays with multiple fill " + "values: '{}'. Picking the first and " + "converting the rest.".format(fill_values), + PerformanceWarning, + stacklevel=6) + keep = to_concat[0] + to_concat2 = [keep] + + for arr in to_concat[1:]: + to_concat2.append(cls(np.asarray(arr), fill_value=fill_value)) + + to_concat = to_concat2 + + values = [] + length = 0 + + if to_concat: + sp_kind = to_concat[0].kind + else: + sp_kind = 'integer' + + if sp_kind == 'integer': + indices = [] + + for arr in to_concat: + idx = arr.sp_index.to_int_index().indices.copy() + idx += length # TODO: wraparound + length += arr.sp_index.length + + values.append(arr.sp_values) + indices.append(idx) + + data = np.concatenate(values) + indices = np.concatenate(indices) + sp_index = IntIndex(length, indices) + + else: + # when concatentating block indices, we don't claim that you'll + # get an identical index as concating the values and then + # creating a new index. We don't want to spend the time trying + # to merge blocks across arrays in `to_concat`, so the resulting + # BlockIndex may have more blocs. + blengths = [] + blocs = [] + + for arr in to_concat: + idx = arr.sp_index.to_block_index() + + values.append(arr.sp_values) + blocs.append(idx.blocs.copy() + length) + blengths.append(idx.blengths) + length += arr.sp_index.length + + data = np.concatenate(values) + blocs = np.concatenate(blocs) + blengths = np.concatenate(blengths) + + sp_index = BlockIndex(length, blocs, blengths) + + return cls(data, sparse_index=sp_index, fill_value=fill_value) + + def astype(self, dtype=None, copy=True): + """ + Change the dtype of a SparseArray. + + The output will always be a SparseArray. To convert to a dense + ndarray with a certain dtype, use :meth:`numpy.asarray`. + + Parameters + ---------- + dtype : np.dtype or ExtensionDtype + For SparseDtype, this changes the dtype of + ``self.sp_values`` and the ``self.fill_value``. + + For other dtypes, this only changes the dtype of + ``self.sp_values``. + + copy : bool, default True + Whether to ensure a copy is made, even if not necessary. 
+ + Returns + ------- + SparseArray + + Examples + -------- + >>> arr = SparseArray([0, 0, 1, 2]) + >>> arr + [0, 0, 1, 2] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + + >>> arr.astype(np.dtype('int32')) + [0, 0, 1, 2] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + + Using a NumPy dtype with a different kind (e.g. float) will coerce + just ``self.sp_values``. + + >>> arr.astype(np.dtype('float64')) + ... # doctest: +NORMALIZE_WHITESPACE + [0, 0, 1.0, 2.0] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + + Use a SparseDtype if you wish to be change the fill value as well. + + >>> arr.astype(SparseDtype("float64", fill_value=np.nan)) + ... # doctest: +NORMALIZE_WHITESPACE + [nan, nan, 1.0, 2.0] + Fill: nan + IntIndex + Indices: array([2, 3], dtype=int32) + """ + dtype = self.dtype.update_dtype(dtype) + subtype = dtype._subtype_with_str + sp_values = astype_nansafe(self.sp_values, + subtype, + copy=copy) + if sp_values is self.sp_values and copy: + sp_values = sp_values.copy() + + return self._simple_new(sp_values, + self.sp_index, + dtype) + + def map(self, mapper): + """ + Map categories using input correspondence (dict, Series, or function). + + Parameters + ---------- + mapper : dict, Series, callable + The correspondence from old values to new. + + Returns + ------- + SparseArray + The output array will have the same density as the input. + The output fill value will be the result of applying the + mapping to ``self.fill_value`` + + Examples + -------- + >>> arr = pd.SparseArray([0, 1, 2]) + >>> arr.apply(lambda x: x + 10) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + + >>> arr.apply({0: 10, 1: 11, 2: 12}) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + + >>> arr.apply(pd.Series([10, 11, 12], index=[0, 1, 2])) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + """ + # this is used in apply. + # We get hit since we're an "is_extension_type" but regular extension + # types are not hit. This may be worth adding to the interface. + if isinstance(mapper, ABCSeries): + mapper = mapper.to_dict() + + if isinstance(mapper, compat.Mapping): + fill_value = mapper.get(self.fill_value, self.fill_value) + sp_values = [mapper.get(x, None) for x in self.sp_values] + else: + fill_value = mapper(self.fill_value) + sp_values = [mapper(x) for x in self.sp_values] + + return type(self)(sp_values, sparse_index=self.sp_index, + fill_value=fill_value) + + def to_dense(self): + """ + Convert SparseArray to a NumPy array. + + Returns + ------- + arr : NumPy array + """ + return np.asarray(self, dtype=self.sp_values.dtype) + + # TODO: Look into deprecating this in favor of `to_dense`. 
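+    # For example (illustrative):
+    #   >>> SparseArray([0, 0, 1, 2], fill_value=0).to_dense()
+    #   array([0, 0, 1, 2])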
+ get_values = to_dense + + # ------------------------------------------------------------------------ + # IO + # ------------------------------------------------------------------------ + def __setstate__(self, state): + """Necessary for making this object picklable""" + if isinstance(state, tuple): + # Compat for pandas < 0.24.0 + nd_state, (fill_value, sp_index) = state + sparse_values = np.array([]) + sparse_values.__setstate__(nd_state) + + self._sparse_values = sparse_values + self._sparse_index = sp_index + self._dtype = SparseDtype(sparse_values.dtype, fill_value) + else: + self.__dict__.update(state) + + def nonzero(self): + if self.fill_value == 0: + return self.sp_index.to_int_index().indices, + else: + return self.sp_index.to_int_index().indices[self.sp_values != 0], + + # ------------------------------------------------------------------------ + # Reductions + # ------------------------------------------------------------------------ + + def _reduce(self, name, skipna=True, **kwargs): + method = getattr(self, name, None) + + if method is None: + raise TypeError("cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype)) + + if skipna: + arr = self + else: + arr = self.dropna() + + # we don't support these kwargs. + # They should only be present when called via pandas, so do it here. + # instead of in `any` / `all` (which will raise if they're present, + # thanks to nv.validate + kwargs.pop('filter_type', None) + kwargs.pop('numeric_only', None) + kwargs.pop('op', None) + return getattr(arr, name)(**kwargs) + + def all(self, axis=None, *args, **kwargs): + """ + Tests whether all elements evaluate True + + Returns + ------- + all : bool + + See Also + -------- + numpy.all + """ + nv.validate_all(args, kwargs) + + values = self.sp_values + + if len(values) != len(self) and not np.all(self.fill_value): + return False + + return values.all() + + def any(self, axis=0, *args, **kwargs): + """ + Tests whether at least one of elements evaluate True + + Returns + ------- + any : bool + + See Also + -------- + numpy.any + """ + nv.validate_any(args, kwargs) + + values = self.sp_values + + if len(values) != len(self) and np.any(self.fill_value): + return True + + return values.any().item() + + def sum(self, axis=0, *args, **kwargs): + """ + Sum of non-NA/null values + + Returns + ------- + sum : float + """ + nv.validate_sum(args, kwargs) + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + if self._null_fill_value: + return sp_sum + else: + nsparse = self.sp_index.ngaps + return sp_sum + self.fill_value * nsparse + + def cumsum(self, axis=0, *args, **kwargs): + """ + Cumulative sum of non-NA/null values. + + When performing the cumulative summation, any non-NA/null values will + be skipped. The resulting SparseArray will preserve the locations of + NaN values, but the fill value will be `np.nan` regardless. + + Parameters + ---------- + axis : int or None + Axis over which to perform the cumulative summation. If None, + perform cumulative summation over flattened array. + + Returns + ------- + cumsum : SparseArray + """ + nv.validate_cumsum(args, kwargs) + + if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour. 
+ raise ValueError("axis(={axis}) out of bounds".format(axis=axis)) + + if not self._null_fill_value: + return SparseArray(self.to_dense()).cumsum() + + return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index, + fill_value=self.fill_value) + + def mean(self, axis=0, *args, **kwargs): + """ + Mean of non-NA/null values + + Returns + ------- + mean : float + """ + nv.validate_mean(args, kwargs) + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + ct = len(valid_vals) + + if self._null_fill_value: + return sp_sum / ct + else: + nsparse = self.sp_index.ngaps + return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) + + def transpose(self, *axes): + """ + Returns the SparseArray. + """ + return self + + @property + def T(self): + """ + Returns the SparseArray. + """ + return self + + # ------------------------------------------------------------------------ + # Ufuncs + # ------------------------------------------------------------------------ + + def __array_wrap__(self, array, context=None): + from pandas.core.dtypes.generic import ABCSparseSeries + + ufunc, inputs, _ = context + inputs = tuple(x.values if isinstance(x, ABCSparseSeries) else x + for x in inputs) + return self.__array_ufunc__(ufunc, '__call__', *inputs) + + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + out = kwargs.get('out', ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)): + return NotImplemented + + special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv', + 'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder'} + if compat.PY2: + special.add('div') + aliases = { + 'subtract': 'sub', + 'multiply': 'mul', + 'floor_divide': 'floordiv', + 'true_divide': 'truediv', + 'power': 'pow', + 'remainder': 'mod', + 'divide': 'div', + 'equal': 'eq', + 'not_equal': 'ne', + 'less': 'lt', + 'less_equal': 'le', + 'greater': 'gt', + 'greater_equal': 'ge', + } + + flipped = { + 'lt': '__gt__', + 'le': '__ge__', + 'gt': '__lt__', + 'ge': '__le__', + 'eq': '__eq__', + 'ne': '__ne__', + } + + op_name = ufunc.__name__ + op_name = aliases.get(op_name, op_name) + + if op_name in special and kwargs.get('out') is None: + if isinstance(inputs[0], type(self)): + return getattr(self, '__{}__'.format(op_name))(inputs[1]) + else: + name = flipped.get(op_name, '__r{}__'.format(op_name)) + return getattr(self, name)(inputs[0]) + + if len(inputs) == 1: + # No alignment necessary. 
+ sp_values = getattr(ufunc, method)(self.sp_values, **kwargs) + fill_value = getattr(ufunc, method)(self.fill_value, **kwargs) + return self._simple_new(sp_values, + self.sp_index, + SparseDtype(sp_values.dtype, fill_value)) + + result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], + **kwargs) + if out: + if len(out) == 1: + out = out[0] + return out + + if type(result) is tuple: + return tuple(type(self)(x) for x in result) + elif method == 'at': + # no return value + return None + else: + return type(self)(result) + + def __abs__(self): + return np.abs(self) + + # ------------------------------------------------------------------------ + # Ops + # ------------------------------------------------------------------------ + + @classmethod + def _create_unary_method(cls, op): + def sparse_unary_method(self): + fill_value = op(np.array(self.fill_value)).item() + values = op(self.sp_values) + dtype = SparseDtype(values.dtype, fill_value) + return cls._simple_new(values, self.sp_index, dtype) + + name = '__{name}__'.format(name=op.__name__) + return compat.set_function_name(sparse_unary_method, name, cls) + + @classmethod + def _create_arithmetic_method(cls, op): + def sparse_arithmetic_method(self, other): + op_name = op.__name__ + + if isinstance(other, (ABCSeries, ABCIndexClass)): + # Rely on pandas to dispatch to us. + return NotImplemented + + if isinstance(other, SparseArray): + return _sparse_array_op(self, other, op, op_name) + + elif is_scalar(other): + with np.errstate(all='ignore'): + fill = op(_get_fill(self), np.asarray(other)) + result = op(self.sp_values, other) + + if op_name == 'divmod': + left, right = result + lfill, rfill = fill + return (_wrap_result(op_name, left, self.sp_index, lfill), + _wrap_result(op_name, right, self.sp_index, rfill)) + + return _wrap_result(op_name, result, self.sp_index, fill) + + else: + other = np.asarray(other) + with np.errstate(all='ignore'): + # TODO: delete sparse stuff in core/ops.py + # TODO: look into _wrap_result + if len(self) != len(other): + raise AssertionError( + ("length mismatch: {self} vs. {other}".format( + self=len(self), other=len(other)))) + if not isinstance(other, SparseArray): + dtype = getattr(other, 'dtype', None) + other = SparseArray(other, fill_value=self.fill_value, + dtype=dtype) + return _sparse_array_op(self, other, op, op_name) + + name = '__{name}__'.format(name=op.__name__) + return compat.set_function_name(sparse_arithmetic_method, name, cls) + + @classmethod + def _create_comparison_method(cls, op): + def cmp_method(self, other): + op_name = op.__name__ + + if op_name in {'and_', 'or_'}: + op_name = op_name[:-1] + + if isinstance(other, (ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. + return NotImplemented + + if not is_scalar(other) and not isinstance(other, type(self)): + # convert list-like to ndarray + other = np.asarray(other) + + if isinstance(other, np.ndarray): + # TODO: make this more flexible than just ndarray... + if len(self) != len(other): + raise AssertionError("length mismatch: {self} vs. 
{other}" + .format(self=len(self), + other=len(other))) + other = SparseArray(other, fill_value=self.fill_value) + + if isinstance(other, SparseArray): + return _sparse_array_op(self, other, op, op_name) + else: + with np.errstate(all='ignore'): + fill_value = op(self.fill_value, other) + result = op(self.sp_values, other) + + return type(self)(result, + sparse_index=self.sp_index, + fill_value=fill_value, + dtype=np.bool_) + + name = '__{name}__'.format(name=op.__name__) + return compat.set_function_name(cmp_method, name, cls) + + @classmethod + def _add_unary_ops(cls): + cls.__pos__ = cls._create_unary_method(operator.pos) + cls.__neg__ = cls._create_unary_method(operator.neg) + cls.__invert__ = cls._create_unary_method(operator.invert) + + @classmethod + def _add_comparison_ops(cls): + cls.__and__ = cls._create_comparison_method(operator.and_) + cls.__or__ = cls._create_comparison_method(operator.or_) + super(SparseArray, cls)._add_comparison_ops() + + # ---------- + # Formatting + # ----------- + def __unicode__(self): + return '{self}\nFill: {fill}\n{index}'.format( + self=printing.pprint_thing(self), + fill=printing.pprint_thing(self.fill_value), + index=printing.pprint_thing(self.sp_index)) + + def _formatter(self, boxed=False): + # Defer to the formatter from the GenericArrayFormatter calling us. + # This will infer the correct formatter from the dtype of the values. + return None + + +SparseArray._add_arithmetic_ops() +SparseArray._add_comparison_ops() +SparseArray._add_unary_ops() + + +def _maybe_to_dense(obj): + """ + try to convert to dense + """ + if hasattr(obj, 'to_dense'): + return obj.to_dense() + return obj + + +def _maybe_to_sparse(array): + """ + array must be SparseSeries or SparseArray + """ + if isinstance(array, ABCSparseSeries): + array = array.values.copy() + return array + + +def _sanitize_values(arr): + """ + return an ndarray for our input, + in a platform independent manner + """ + + if hasattr(arr, 'values'): + arr = arr.values + else: + + # scalar + if is_scalar(arr): + arr = [arr] + + # ndarray + if isinstance(arr, np.ndarray): + pass + + elif is_list_like(arr) and len(arr) > 0: + arr = maybe_convert_platform(arr) + + else: + arr = np.asarray(arr) + + return arr + + +def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False): + """ + Convert ndarray to sparse format + + Parameters + ---------- + arr : ndarray + kind : {'block', 'integer'} + fill_value : NaN or another value + dtype : np.dtype, optional + copy : bool, default False + + Returns + ------- + (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar) + """ + + arr = _sanitize_values(arr) + + if arr.ndim > 1: + raise TypeError("expected dimension <= 1 data") + + if fill_value is None: + fill_value = na_value_for_dtype(arr.dtype) + + if isna(fill_value): + mask = notna(arr) + else: + # For str arrays in NumPy 1.12.0, operator!= below isn't + # element-wise but just returns False if fill_value is not str, + # so cast to object comparison to be safe + if is_string_dtype(arr): + arr = arr.astype(object) + + if is_object_dtype(arr.dtype): + # element-wise equality check method in numpy doesn't treat + # each element type, eg. 0, 0.0, and False are treated as + # same. So we have to check the both of its type and value. 
+ mask = splib.make_mask_object_ndarray(arr, fill_value) + else: + mask = arr != fill_value + + length = len(arr) + if length != len(mask): + # the arr is a SparseArray + indices = mask.sp_index.indices + else: + indices = mask.nonzero()[0].astype(np.int32) + + index = _make_index(length, indices, kind) + sparsified_values = arr[mask] + if dtype is not None: + sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) + # TODO: copy + return sparsified_values, index, fill_value + + +def _make_index(length, indices, kind): + + if kind == 'block' or isinstance(kind, BlockIndex): + locs, lens = splib.get_blocks(indices) + index = BlockIndex(length, locs, lens) + elif kind == 'integer' or isinstance(kind, IntIndex): + index = IntIndex(length, indices) + else: # pragma: no cover + raise ValueError('must be block or integer type') + return index + + +# ---------------------------------------------------------------------------- +# Accessor + +@delegate_names(SparseArray, ['npoints', 'density', 'fill_value', + 'sp_values'], + typ='property') +class SparseAccessor(PandasDelegate): + """ + Accessor for SparseSparse from other sparse matrix data types. + """ + + def __init__(self, data=None): + self._validate(data) + # Store the Series since we need that for to_coo + self._parent = data + + @staticmethod + def _validate(data): + if not isinstance(data.dtype, SparseDtype): + msg = "Can only use the '.sparse' accessor with Sparse data." + raise AttributeError(msg) + + def _delegate_property_get(self, name, *args, **kwargs): + return getattr(self._parent.values, name) + + def _delegate_method(self, name, *args, **kwargs): + if name == 'from_coo': + return self.from_coo(*args, **kwargs) + elif name == 'to_coo': + return self.to_coo(*args, **kwargs) + else: + raise ValueError + + @classmethod + def from_coo(cls, A, dense_index=False): + """ + Create a SparseSeries from a scipy.sparse.coo_matrix. + + Parameters + ---------- + A : scipy.sparse.coo_matrix + dense_index : bool, default False + If False (default), the SparseSeries index consists of only the + coords of the non-null entries of the original coo_matrix. + If True, the SparseSeries index consists of the full sorted + (row, col) coordinates of the coo_matrix. + + Returns + ------- + s : SparseSeries + + Examples + --------- + >>> from scipy import sparse + >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), + shape=(3, 4)) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 2.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> ss = pd.SparseSeries.from_coo(A) + >>> ss + 0 2 1 + 3 2 + 1 0 3 + dtype: float64 + BlockIndex + Block locations: array([0], dtype=int32) + Block lengths: array([3], dtype=int32) + """ + from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series + from pandas import Series + + result = _coo_to_sparse_series(A, dense_index=dense_index) + # SparseSeries -> Series[sparse] + result = Series(result.values, index=result.index, copy=False) + + return result + + def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): + """ + Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. + + Use row_levels and column_levels to determine the row and column + coordinates respectively. row_levels and column_levels are the names + (labels) or numbers of the levels. {row_levels, column_levels} must be + a partition of the MultiIndex level names (or numbers). 
+ + Parameters + ---------- + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + + Returns + ------- + y : scipy.sparse.coo_matrix + rows : list (row labels) + columns : list (column labels) + + Examples + -------- + >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) + >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1)], + names=['A', 'B', 'C', 'D']) + >>> ss = s.to_sparse() + >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'], + column_levels=['C', 'D'], + sort_labels=True) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 3.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> rows + [(1, 1), (1, 2), (2, 1)] + >>> columns + [('a', 0), ('a', 1), ('b', 0), ('b', 1)] + """ + from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo + + A, rows, columns = _sparse_series_to_coo(self._parent, + row_levels, + column_levels, + sort_labels=sort_labels) + return A, rows, columns diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index df9e57cb5f0e1..9b7e1986e4831 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1,25 +1,33 @@ # -*- coding: utf-8 -*- +from __future__ import division + from datetime import timedelta +import warnings import numpy as np -from pandas._libs import tslibs -from pandas._libs.tslibs import Timedelta, Timestamp, NaT, iNaT +from pandas._libs import algos, lib, tslibs +from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field -from pandas._libs.tslibs.timedeltas import array_to_timedelta64 - -from pandas import compat +from pandas._libs.tslibs.timedeltas import ( + array_to_timedelta64, parse_timedelta_unit) +import pandas.compat as compat +from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - _TD_DTYPE, ensure_int64, is_timedelta64_dtype, is_list_like) -from pandas.core.dtypes.generic import ABCSeries + _TD_DTYPE, ensure_int64, is_datetime64_dtype, is_float_dtype, + is_integer_dtype, is_list_like, is_object_dtype, is_scalar, + is_string_dtype, is_timedelta64_dtype) +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCIndexClass, ABCSeries, ABCTimedeltaIndex) from pandas.core.dtypes.missing import isna +from pandas.core import ops +from pandas.core.algorithms import checked_add_with_arr, unique1d import pandas.core.common as com -from pandas.core.algorithms import checked_add_with_arr -from pandas.tseries.offsets import Tick from pandas.tseries.frequencies import to_offset +from pandas.tseries.offsets import Tick from . 
import datetimelike as dtl @@ -46,12 +54,13 @@ def f(self): values = self.asi8 result = get_timedelta_field(values, alias) if self.hasnans: - result = self._maybe_mask_results(result, convert='float64') + result = self._maybe_mask_results(result, fill_value=None, + convert='float64') return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -62,25 +71,29 @@ def _td_array_cmp(cls, op): opname = '__{name}__'.format(name=op.__name__) nat_result = True if opname == '__ne__' else False + meth = getattr(dtl.DatetimeLikeArrayMixin, opname) + def wrapper(self, other): - msg = "cannot compare a {cls} with type {typ}" - meth = getattr(dtl.DatetimeLikeArrayMixin, opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta - raise TypeError(msg.format(cls=type(self).__name__, - typ=type(other).__name__)) + return ops.invalid_comparison(self, other, op) + result = meth(self, other) if isna(other): result.fill(nat_result) elif not is_list_like(other): - raise TypeError(msg.format(cls=type(self).__name__, - typ=type(other).__name__)) + return ops.invalid_comparison(self, other, op) + else: - other = type(self)(other).values + try: + other = type(self)(other)._data + except (ValueError, TypeError): + return ops.invalid_comparison(self, other, op) + result = meth(self, other) result = com.values_from_object(result) @@ -96,7 +109,13 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -class TimedeltaArrayMixin(dtl.DatetimeLikeArrayMixin): +class TimedeltaArrayMixin(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): + _typ = "timedeltaarray" + __array_priority__ = 1000 + + # Needed so that NaT.__richcmp__(DateTimeArray) operates pointwise + ndim = 1 + @property def _box_func(self): return lambda x: Timedelta(x, unit='ns') @@ -110,47 +129,56 @@ def dtype(self): _attributes = ["freq"] @classmethod - def _simple_new(cls, values, freq=None, **kwargs): - values = np.array(values, copy=False) - if values.dtype == np.object_: - values = array_to_timedelta64(values) - if values.dtype != _TD_DTYPE: - if is_timedelta64_dtype(values): - # non-nano unit - values = values.astype(_TD_DTYPE) - else: - values = ensure_int64(values).view(_TD_DTYPE) + def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): + # `dtype` is passed by _shallow_copy in corner cases, should always + # be timedelta64[ns] if present + assert dtype == _TD_DTYPE + assert isinstance(values, np.ndarray), type(values) + + if values.dtype == 'i8': + values = values.view('m8[ns]') + + assert values.dtype == 'm8[ns]' result = object.__new__(cls) result._data = values result._freq = freq return result - def __new__(cls, values, freq=None, start=None, end=None, periods=None, - closed=None): + def __new__(cls, values, freq=None, dtype=_TD_DTYPE, copy=False): + return cls._from_sequence(values, dtype=dtype, copy=copy, freq=freq) + + @classmethod + def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, + freq=None, unit=None): + if dtype != _TD_DTYPE: + raise ValueError("Only timedelta64[ns] dtype is valid.") freq, freq_infer = dtl.maybe_infer_freq(freq) - if values is None: - # TODO: Remove this block and associated kwargs; GH#20535 - if freq is None and com._any_none(periods, start, end): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') - periods = dtl.validate_periods(periods) - return cls._generate_range(start, end, periods, freq, - closed=closed) - - result = 
cls._simple_new(values, freq=freq) - if freq_infer: - inferred = result.inferred_freq - if inferred: - result.freq = to_offset(inferred) + data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, + freq_infer) + + result = cls._simple_new(data, freq=freq) + + if inferred_freq is None and freq is not None: + # this condition precludes `freq_infer` + cls._validate_frequency(result, freq) + + elif freq_infer: + result.freq = to_offset(result.inferred_freq) return result @classmethod - def _generate_range(cls, start, end, periods, freq, closed=None, **kwargs): - # **kwargs are for compat with TimedeltaIndex, which includes `name` + def _generate_range(cls, start, end, periods, freq, closed=None): + + periods = dtl.validate_periods(periods) + if freq is None and any(x is None for x in [periods, start, end]): + raise ValueError('Must provide freq argument if no data is ' + 'supplied') + if com.count_not_none(start, end, periods, freq) != 3: raise ValueError('Of the four parameters: start, end, periods, ' 'and freq, exactly three must be specified') @@ -170,18 +198,44 @@ def _generate_range(cls, start, end, periods, freq, closed=None, **kwargs): if freq is not None: index = _generate_regular_range(start, end, periods, freq) - index = cls._simple_new(index, freq=freq, **kwargs) else: index = np.linspace(start.value, end.value, periods).astype('i8') - # TODO: shouldn't we pass `name` here? (via **kwargs) - index = cls._simple_new(index, freq=freq) if not left_closed: index = index[1:] if not right_closed: index = index[:-1] - return index + return cls._simple_new(index, freq=freq) + + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) + def _validate_fill_value(self, fill_value): + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)): + fill_value = Timedelta(fill_value).value + else: + raise ValueError("'fill_value' should be a Timedelta. " + "Got '{got}'.".format(got=fill_value)) + return fill_value + + # monotonicity/uniqueness properties are called via frequencies.infer_freq, + # see GH#23789 + + @property + def _is_monotonic_increasing(self): + return algos.is_monotonic(self.asi8, timelike=True)[0] + + @property + def _is_monotonic_decreasing(self): + return algos.is_monotonic(self.asi8, timelike=True)[1] + + @property + def _is_unique(self): + return len(unique1d(self.asi8)) == len(self) # ---------------------------------------------------------------- # Arithmetic Methods @@ -194,61 +248,53 @@ def _add_offset(self, other): .format(typ=type(other).__name__, cls=type(self).__name__)) - def _sub_datelike(self, other): - assert other is not NaT - raise TypeError("cannot subtract a datelike from a {cls}" - .format(cls=type(self).__name__)) - def _add_delta(self, delta): """ Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self. + to self, yielding a new TimedeltaArray. 
Parameters ---------- - delta : timedelta, np.timedelta64, Tick, TimedeltaArray, TimedeltaIndex + other : {timedelta, np.timedelta64, Tick, + TimedeltaIndex, ndarray[timedelta64]} Returns ------- - result : same type as self + result : TimedeltaArray + """ + new_values = dtl.DatetimeLikeArrayMixin._add_delta(self, delta) + return type(self)(new_values, freq='infer') - Notes - ----- - The result's name is set outside of _add_delta by the calling - method (__add__ or __sub__) + def _add_datetime_arraylike(self, other): """ - if isinstance(delta, (Tick, timedelta, np.timedelta64)): - new_values = self._add_delta_td(delta) - elif isinstance(delta, TimedeltaArrayMixin): - new_values = self._add_delta_tdi(delta) - elif is_timedelta64_dtype(delta): - # ndarray[timedelta64] --> wrap in TimedeltaArray/Index - delta = type(self)(delta) - new_values = self._add_delta_tdi(delta) - else: - raise TypeError("cannot add the type {0} to a TimedeltaIndex" - .format(type(delta))) + Add DatetimeArray/Index or ndarray[datetime64] to TimedeltaArray. + """ + if isinstance(other, np.ndarray): + # At this point we have already checked that dtype is datetime64 + from pandas.core.arrays import DatetimeArrayMixin + other = DatetimeArrayMixin(other) - return type(self)(new_values, freq='infer') + # defer to implementation in DatetimeArray + return other + self - def _add_datelike(self, other): + def _add_datetimelike_scalar(self, other): # adding a timedeltaindex to a datetimelike from pandas.core.arrays import DatetimeArrayMixin - if isinstance(other, (DatetimeArrayMixin, np.ndarray)): - # if other is an ndarray, we assume it is datetime64-dtype - # defer to implementation in DatetimeIndex - if not isinstance(other, DatetimeArrayMixin): - other = DatetimeArrayMixin(other) - return other + self - else: - assert other is not NaT - other = Timestamp(other) - i8 = self.asi8 - result = checked_add_with_arr(i8, other.value, - arr_mask=self._isnan) - result = self._maybe_mask_results(result, fill_value=iNaT) + + assert other is not NaT + other = Timestamp(other) + if other is NaT: + # In this case we specifically interpret NaT as a datetime, not + # the timedelta interpretation we would get by returning self + NaT + result = self.asi8.view('m8[ms]') + NaT.to_datetime64() return DatetimeArrayMixin(result) + i8 = self.asi8 + result = checked_add_with_arr(i8, other.value, + arr_mask=self._isnan) + result = self._maybe_mask_results(result) + return DatetimeArrayMixin(result, tz=other.tz, freq=self.freq) + def _addsub_offset_array(self, other, op): # Add or subtract Array-like of DateOffset objects try: @@ -261,31 +307,310 @@ def _addsub_offset_array(self, other, op): raise TypeError("Cannot add/subtract non-tick DateOffset to {cls}" .format(cls=type(self).__name__)) - def _evaluate_with_timedelta_like(self, other, op): - if isinstance(other, ABCSeries): - # GH#19042 + def __mul__(self, other): + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + return NotImplemented + + if is_scalar(other): + # numpy will accept float and int, raise TypeError for others + result = self._data * other + freq = None + if self.freq is not None and not isna(other): + freq = self.freq * other + return type(self)(result, freq=freq) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self) and not is_timedelta64_dtype(other): + # Exclude timedelta64 here so we correctly raise TypeError + # for that instead of ValueError + raise ValueError("Cannot 
multiply with unequal lengths") + + if is_object_dtype(other): + # this multiplication will succeed only if all elements of other + # are int or float scalars, so we will end up with + # timedelta64[ns]-dtyped result + result = [self[n] * other[n] for n in range(len(self))] + result = np.array(result) + return type(self)(result) + + # numpy will accept float or int dtype, raise TypeError for others + result = self._data * other + return type(self)(result) + + __rmul__ = __mul__ + + def __truediv__(self, other): + # timedelta / X is well-defined for timedelta-like or numeric X + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # specifically timedelta64-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # otherwise, dispatch to Timedelta implementation + return self._data / other + + elif lib.is_scalar(other): + # assume it is numeric + result = self._data / other + freq = None + if self.freq is not None: + # Tick division is not implemented, so operate on Timedelta + freq = self.freq.delta / other + return type(self)(result, freq=freq) + + if not hasattr(other, "dtype"): + # e.g. list, tuple + other = np.array(other) + + if len(other) != len(self): + raise ValueError("Cannot divide vectors with unequal lengths") + + elif is_timedelta64_dtype(other): + # let numpy handle it + return self._data / other + + elif is_object_dtype(other): + # Note: we do not do type inference on the result, so either + # an object array or numeric-dtyped (if numpy does inference) + # will be returned. GH#23829 + result = [self[n] / other[n] for n in range(len(self))] + result = np.array(result) + return result + + else: + result = self._data / other + return type(self)(result) + + def __rtruediv__(self, other): + # X / timedelta is defined only for timedelta-like X + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): return NotImplemented - opstr = '__{opname}__'.format(opname=op.__name__).replace('__r', '__') - # allow division by a timedelta - if opstr in ['__div__', '__truediv__', '__floordiv__']: - if _is_convertible_to_td(other): + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # specifically timedelta64-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # otherwise, dispatch to Timedelta implementation + return other / self._data + + elif lib.is_scalar(other): + raise TypeError("Cannot divide {typ} by {cls}" + .format(typ=type(other).__name__, + cls=type(self).__name__)) + + if not hasattr(other, "dtype"): + # e.g. list, tuple + other = np.array(other) + + if len(other) != len(self): + raise ValueError("Cannot divide vectors with unequal lengths") + + elif is_timedelta64_dtype(other): + # let numpy handle it + return other / self._data + + elif is_object_dtype(other): + # Note: unlike in __truediv__, we do not _need_ to do type# + # inference on the result. It does not raise, a numeric array + # is returned. 
GH#23829 + result = [other[n] / self[n] for n in range(len(self))] + return np.array(result) + + else: + raise TypeError("Cannot divide {dtype} data by {cls}" + .format(dtype=other.dtype, + cls=type(self).__name__)) + + if compat.PY2: + __div__ = __truediv__ + __rdiv__ = __rtruediv__ + + def __floordiv__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if is_scalar(other): + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # treat this specifically as timedelta-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # dispatch to Timedelta implementation + result = other.__rfloordiv__(self._data) + return result + + # at this point we should only have numeric scalars; anything + # else will raise + result = self.asi8 // other + result[self._isnan] = iNaT + freq = None + if self.freq is not None: + # Note: freq gets division, not floor-division + freq = self.freq / other + return type(self)(result.view('m8[ns]'), freq=freq) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self): + raise ValueError("Cannot divide with unequal lengths") + + elif is_timedelta64_dtype(other): + other = type(self)(other) + + # numpy timedelta64 does not natively support floordiv, so operate + # on the i8 values + result = self.asi8 // other.asi8 + mask = self._isnan | other._isnan + if mask.any(): + result = result.astype(np.int64) + result[mask] = np.nan + return result + + elif is_object_dtype(other): + result = [self[n] // other[n] for n in range(len(self))] + result = np.array(result) + if lib.infer_dtype(result) == 'timedelta': + result, _ = sequence_to_td64ns(result) + return type(self)(result) + return result + + elif is_integer_dtype(other) or is_float_dtype(other): + result = self._data // other + return type(self)(result) + + else: + dtype = getattr(other, "dtype", type(other).__name__) + raise TypeError("Cannot divide {typ} by {cls}" + .format(typ=dtype, cls=type(self).__name__)) + + def __rfloordiv__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if is_scalar(other): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) - if isna(other): - raise NotImplementedError( - "division by pd.NaT not implemented") - - i8 = self.asi8 - left, right = i8, other.value - - if opstr in ['__floordiv__']: - result = op(left, right) - else: - result = op(left, np.float64(right)) - result = self._maybe_mask_results(result, convert='float64') + if other is NaT: + # treat this specifically as timedelta-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # dispatch to Timedelta implementation + result = other.__floordiv__(self._data) return result - return NotImplemented + raise TypeError("Cannot divide {typ} by {cls}" + .format(typ=type(other).__name__, + cls=type(self).__name__)) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self): + raise ValueError("Cannot divide with unequal lengths") + + elif is_timedelta64_dtype(other): + other = type(self)(other) + + # numpy timedelta64 does not natively support floordiv, so operate + # on the i8 values + result = other.asi8 // self.asi8 + mask = self._isnan | other._isnan + if mask.any(): + result = 
result.astype(np.int64) + result[mask] = np.nan + return result + + elif is_object_dtype(other): + result = [other[n] // self[n] for n in range(len(self))] + result = np.array(result) + return result + + else: + dtype = getattr(other, "dtype", type(other).__name__) + raise TypeError("Cannot divide {typ} by {cls}" + .format(typ=dtype, cls=type(self).__name__)) + + def __mod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + return self - (self // other) * other + + def __rmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + return other - (other // self) * self + + def __divmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + + res1 = self // other + res2 = self - res1 * other + return res1, res2 + + def __rdivmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + + res1 = other // self + res2 = other - res1 * self + return res1, res2 + + # Note: TimedeltaIndex overrides this in call to cls._add_numeric_methods + def __neg__(self): + if self.freq is not None: + return type(self)(-self._data, freq=-self.freq) + return type(self)(-self._data) + + def __abs__(self): + # Note: freq is not preserved + return type(self)(np.abs(self._data)) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timedelta methods @@ -345,12 +670,12 @@ def total_seconds(self): Float64Index([0.0, 86400.0, 172800.0, 259200.00000000003, 345600.0], dtype='float64') """ - return self._maybe_mask_results(1e-9 * self.asi8) + return self._maybe_mask_results(1e-9 * self.asi8, fill_value=None) def to_pytimedelta(self): """ Return Timedelta Array/Index as object ndarray of datetime.timedelta - objects + objects. Returns ------- @@ -359,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - " Number of days for each element. ") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - " Number of seconds (>= 0 and less than 1 day) " - "for each element. ") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each\nelement. 
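(Aside, not from the patch: a short usage sketch of the division, floor-division, modulo and divmod behaviour the methods above implement for timedelta data; it assumes a pandas release that ships these operators, roughly 0.24 onward.)

    import pandas as pd

    tdi = pd.to_timedelta(["1 day 6 hours", "2 days", "3 hours"])
    half_day = pd.Timedelta(hours=12)

    print(tdi / half_day)             # float ratios: 2.5, 4.0, 0.25
    print(tdi // half_day)            # integer quotients: 2, 4, 0
    print(tdi % pd.Timedelta("1D"))   # remainders: 6 hours, 0, 3 hours
    q, r = divmod(tdi, pd.Timedelta("1D"))   # quotient and remainder in one call
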
") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each\nelement.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): @@ -401,12 +726,177 @@ def f(x): TimedeltaArrayMixin._add_comparison_ops() -TimedeltaArrayMixin._add_datetimelike_methods() # --------------------------------------------------------------------- # Constructor Helpers +def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): + """ + Parameters + ---------- + array : list-like + copy : bool, default False + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. + + Returns + ------- + converted : numpy.ndarray + The sequence converted to a numpy array with dtype ``timedelta64[ns]``. + inferred_freq : Tick or None + The inferred frequency of the sequence. + + Raises + ------ + ValueError : Data cannot be converted to timedelta64[ns]. + + Notes + ----- + Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause + errors to be ignored; they are caught and subsequently ignored at a + higher level. + """ + inferred_freq = None + unit = parse_timedelta_unit(unit) + + # Unwrap whatever we have into a np.ndarray + if not hasattr(data, 'dtype'): + # e.g. list, tuple + if np.ndim(data) == 0: + # i.e. generator + data = list(data) + data = np.array(data, copy=False) + elif isinstance(data, ABCSeries): + data = data._values + elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArrayMixin)): + inferred_freq = data.freq + data = data._data + + # Convert whatever we have into timedelta64[ns] dtype + if is_object_dtype(data) or is_string_dtype(data): + # no need to make a copy, need to convert if string-dtyped + data = objects_to_td64ns(data, unit=unit, errors=errors) + copy = False + + elif is_integer_dtype(data): + # treat as multiples of the given unit + data, copy_made = ints_to_td64ns(data, unit=unit) + copy = copy and not copy_made + + elif is_float_dtype(data): + # treat as multiples of the given unit. If after converting to nanos, + # there are fractional components left, these are truncated + # (i.e. NOT rounded) + mask = np.isnan(data) + coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns') + data = (coeff * data).astype(np.int64).view('timedelta64[ns]') + data[mask] = iNaT + copy = False + + elif is_timedelta64_dtype(data): + if data.dtype != _TD_DTYPE: + # non-nano unit + # TODO: watch out for overflows + data = data.astype(_TD_DTYPE) + copy = False + + elif is_datetime64_dtype(data): + # GH#23539 + warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is " + "deprecated, will raise a TypeError in a future " + "version", + FutureWarning, stacklevel=4) + data = ensure_int64(data).view(_TD_DTYPE) + + else: + raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]" + .format(dtype=data.dtype)) + + data = np.array(data, copy=copy) + assert data.dtype == 'm8[ns]', data + return data, inferred_freq + + +def ints_to_td64ns(data, unit="ns"): + """ + Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating + the integers as multiples of the given timedelta unit. 
+ + Parameters + ---------- + data : numpy.ndarray with integer-dtype + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + + Returns + ------- + numpy.ndarray : timedelta64[ns] array converted from data + bool : whether a copy was made + """ + copy_made = False + unit = unit if unit is not None else "ns" + + if data.dtype != np.int64: + # converting to int64 makes a copy, so we can avoid + # re-copying later + data = data.astype(np.int64) + copy_made = True + + if unit != "ns": + dtype_str = "timedelta64[{unit}]".format(unit=unit) + data = data.view(dtype_str) + + # TODO: watch out for overflows when converting from lower-resolution + data = data.astype("timedelta64[ns]") + # the astype conversion makes a copy, so we can avoid re-copying later + copy_made = True + + else: + data = data.view("timedelta64[ns]") + + return data, copy_made + + +def objects_to_td64ns(data, unit="ns", errors="raise"): + """ + Convert a object-dtyped or string-dtyped array into an + timedelta64[ns]-dtyped array. + + Parameters + ---------- + data : ndarray or Index + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. + + Returns + ------- + numpy.ndarray : timedelta64[ns] array converted from data + + Raises + ------ + ValueError : Data cannot be converted to timedelta64[ns]. + + Notes + ----- + Unlike `pandas.to_timedelta`, if setting `errors=ignore` will not cause + errors to be ignored; they are caught and subsequently ignored at a + higher level. + """ + # coerce Index to np.ndarray, converting string-dtype if necessary + values = np.array(data, dtype=np.object_, copy=False) + + result = array_to_timedelta64(values, + unit=unit, errors=errors) + return result.view('timedelta64[ns]') + + def _generate_regular_range(start, end, periods, offset): stride = offset.nanos if periods is None: diff --git a/pandas/core/base.py b/pandas/core/base.py index 00c049497c0d8..e7c3a45a710e0 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1,32 +1,28 @@ """ Base and utility classes for pandas objects. 
""" -import warnings import textwrap -from pandas import compat -from pandas.compat import builtins -import numpy as np +import warnings -from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass -from pandas.core.dtypes.common import ( - is_datetimelike, - is_object_dtype, - is_list_like, - is_scalar, - is_extension_type, - is_extension_array_dtype) +import numpy as np -from pandas.util._validators import validate_bool_kwarg -from pandas.errors import AbstractMethodError -from pandas.core import common as com, algorithms -import pandas.core.nanops as nanops import pandas._libs.lib as lib +import pandas.compat as compat +from pandas.compat import PYPY, OrderedDict, builtins from pandas.compat.numpy import function as nv -from pandas.compat import PYPY, OrderedDict -from pandas.util._decorators import Appender, cache_readonly, Substitution +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._validators import validate_bool_kwarg +from pandas.core.dtypes.common import ( + is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, + is_extension_type, is_list_like, is_object_dtype, is_scalar) +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms, common as com from pandas.core.accessor import DirNamesMixin +import pandas.core.nanops as nanops _shared_docs = dict() _indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', @@ -178,11 +174,13 @@ class SelectionMixin(object): _selection = None _internal_names = ['_cache', '__setstate__'] _internal_names_set = set(_internal_names) + _builtin_table = OrderedDict(( (builtins.sum, np.sum), (builtins.max, np.max), (builtins.min, np.min), )) + _cython_table = OrderedDict(( (builtins.sum, 'sum'), (builtins.max, 'max'), @@ -190,15 +188,25 @@ class SelectionMixin(object): (np.all, 'all'), (np.any, 'any'), (np.sum, 'sum'), + (np.nansum, 'sum'), (np.mean, 'mean'), + (np.nanmean, 'mean'), (np.prod, 'prod'), + (np.nanprod, 'prod'), (np.std, 'std'), + (np.nanstd, 'std'), (np.var, 'var'), + (np.nanvar, 'var'), (np.median, 'median'), + (np.nanmedian, 'median'), (np.max, 'max'), + (np.nanmax, 'max'), (np.min, 'min'), + (np.nanmin, 'min'), (np.cumprod, 'cumprod'), + (np.nancumprod, 'cumprod'), (np.cumsum, 'cumsum'), + (np.nancumsum, 'cumsum'), )) @property @@ -395,8 +403,8 @@ def nested_renaming_depr(level=4): elif isinstance(obj, ABCSeries): nested_renaming_depr() - elif isinstance(obj, ABCDataFrame) and \ - k not in obj.columns: + elif (isinstance(obj, ABCDataFrame) and + k not in obj.columns): raise KeyError( "Column '{col}' does not exist!".format(col=k)) @@ -623,7 +631,9 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): return result def _shallow_copy(self, obj=None, obj_type=None, **kwargs): - """ return a new object with the replacement attributes """ + """ + return a new object with the replacement attributes + """ if obj is None: obj = self._selected_obj.copy() if obj_type is None: @@ -636,7 +646,9 @@ def _shallow_copy(self, obj=None, obj_type=None, **kwargs): return obj_type(obj, **kwargs) def _is_cython_func(self, arg): - """ if we define an internal function for this argument, return it """ + """ + if we define an internal function for this argument, return it + """ return self._cython_table.get(arg) def _is_builtin_func(self, arg): @@ -656,16 +668,19 @@ class 
IndexOpsMixin(object): __array_priority__ = 1000 def transpose(self, *args, **kwargs): - """ return the transpose, which is by definition self """ + """ + Return the transpose, which is by definition self. + """ nv.validate_transpose(args, kwargs) return self - T = property(transpose, doc="return the transpose, which is by " - "definition self") + T = property(transpose, doc="Return the transpose, which is by " + "definition self.") @property def _is_homogeneous_type(self): - """Whether the object has a single dtype. + """ + Whether the object has a single dtype. By definition, Series and Index are always considered homogeneous. A MultiIndex may or may not be homogeneous, depending on the @@ -680,19 +695,21 @@ def _is_homogeneous_type(self): @property def shape(self): - """ return a tuple of the shape of the underlying data """ + """ + Return a tuple of the shape of the underlying data. + """ return self._values.shape @property def ndim(self): - """ return the number of dimensions of the underlying data, - by definition 1 + """ + Number of dimensions of the underlying data, by definition 1. """ return 1 def item(self): - """ return the first element of the underlying data as a python - scalar + """ + Return the first element of the underlying data as a python scalar. """ try: return self.values.item() @@ -703,7 +720,9 @@ def item(self): @property def data(self): - """ return the data pointer of the underlying data """ + """ + Return the data pointer of the underlying data. + """ warnings.warn("{obj}.data is deprecated and will be removed " "in a future version".format(obj=type(self).__name__), FutureWarning, stacklevel=2) @@ -711,7 +730,9 @@ def data(self): @property def itemsize(self): - """ return the size of the dtype of the item of the underlying data """ + """ + Return the size of the dtype of the item of the underlying data. + """ warnings.warn("{obj}.itemsize is deprecated and will be removed " "in a future version".format(obj=type(self).__name__), FutureWarning, stacklevel=2) @@ -719,12 +740,16 @@ def itemsize(self): @property def nbytes(self): - """ return the number of bytes in the underlying data """ + """ + Return the number of bytes in the underlying data. + """ return self._values.nbytes @property def strides(self): - """ return the strides of the underlying data """ + """ + Return the strides of the underlying data. + """ warnings.warn("{obj}.strides is deprecated and will be removed " "in a future version".format(obj=type(self).__name__), FutureWarning, stacklevel=2) @@ -732,12 +757,16 @@ def strides(self): @property def size(self): - """ return the number of elements in the underlying data """ + """ + Return the number of elements in the underlying data. + """ return self._values.size @property def flags(self): - """ return the ndarray.flags for the underlying data """ + """ + Return the ndarray.flags for the underlying data. + """ warnings.warn("{obj}.flags is deprecated and will be removed " "in a future version".format(obj=type(self).__name__), FutureWarning, stacklevel=2) @@ -745,18 +774,138 @@ def flags(self): @property def base(self): - """ return the base object if the memory of the underlying data is - shared + """ + Return the base object if the memory of the underlying data is shared. 
""" warnings.warn("{obj}.base is deprecated and will be removed " "in a future version".format(obj=type(self).__name__), FutureWarning, stacklevel=2) return self.values.base + @property + def array(self): + # type: () -> Union[np.ndarray, ExtensionArray] + """ + The actual Array backing this Series or Index. + + .. versionadded:: 0.24.0 + + Returns + ------- + array : numpy.ndarray or ExtensionArray + This is the actual array stored within this object. This differs + from ``.values`` which may require converting the data + to a different form. + + See Also + -------- + Index.to_numpy : Similar method that always returns a NumPy array. + Series.to_numpy : Similar method that always returns a NumPy array. + + Notes + ----- + This table lays out the different array types for each extension + dtype within pandas. + + ================== ============================= + dtype array type + ================== ============================= + category Categorical + period PeriodArray + interval IntervalArray + IntegerNA IntegerArray + datetime64[ns, tz] DatetimeArray + ================== ============================= + + For any 3rd-party extension types, the array type will be an + ExtensionArray. + + For all remaining dtypes ``.array`` will be the :class:`numpy.ndarray` + stored within. If you absolutely need a NumPy array (possibly with + copying / coercing data), then use :meth:`Series.to_numpy` instead. + + .. note:: + + ``.array`` will always return the underlying object backing the + Series or Index. If a future version of pandas adds a specialized + extension type for a data type, then the return type of ``.array`` + for that data type will change from an object-dtype ndarray to the + new ExtensionArray. + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.array + [a, b, a] + Categories (2, object): [a, b] + """ + return self._values + + def to_numpy(self): + """ + A NumPy ndarray representing the values in this Series or Index. + + .. versionadded:: 0.24.0 + + The returned array will be the same up to equality (values equal + in `self` will be equal in the returned array; likewise for values + that are not equal). When `self` contains an ExtensionArray, the + dtype may be different. For example, for a category-dtype Series, + ``to_numpy()`` will return a NumPy array and the categorical dtype + will be lost. + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.array : Get the actual data stored within. + Index.array : Get the actual data stored within. + DataFrame.to_numpy : Similar method for DataFrame. + + Notes + ----- + For NumPy dtypes, this will be a reference to the actual data stored + in this Series or Index. Modifying the result in place will modify + the data stored in the Series or Index (not that we recommend doing + that). + + For extension types, ``to_numpy()`` *may* require copying data and + coercing the result to a NumPy type (possibly object), which may be + expensive. When you need a no-copy reference to the underlying data, + :attr:`Series.array` should be used instead. + + This table lays out the different dtypes and return types of + ``to_numpy()`` for various dtypes within pandas. 
+ + ================== ================================ + dtype array type + ================== ================================ + category[T] ndarray[T] (same dtype as input) + period ndarray[object] (Periods) + interval ndarray[object] (Intervals) + IntegerNA ndarray[object] + datetime64[ns, tz] ndarray[object] (Timestamps) + ================== ================================ + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.to_numpy() + array(['a', 'b', 'a'], dtype=object) + """ + if (is_extension_array_dtype(self.dtype) or + is_datetime64tz_dtype(self.dtype)): + # TODO(DatetimeArray): remove the second clause. + return np.asarray(self._values) + return self._values + @property def _ndarray_values(self): # type: () -> np.ndarray - """The data as an ndarray, possibly losing information. + """ + The data as an ndarray, possibly losing information. The expectation is that this is cheap to compute, and is primarily used for interacting with our indexers. @@ -806,9 +955,9 @@ def max(self): def argmax(self, axis=None): """ - return a ndarray of the maximum argument indexer + Return a ndarray of the maximum argument indexer. - See also + See Also -------- numpy.ndarray.argmax """ @@ -849,9 +998,9 @@ def min(self): def argmin(self, axis=None): """ - return a ndarray of the minimum argument indexer + Return a ndarray of the minimum argument indexer. - See also + See Also -------- numpy.ndarray.argmin """ @@ -888,8 +1037,10 @@ def __iter__(self): @cache_readonly def hasnans(self): - """ return if I have any nans; enables various perf speedups """ - return isna(self).any() + """ + Return if I have any nans; enables various perf speedups. + """ + return bool(isna(self).any()) def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): @@ -901,7 +1052,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, return func(**kwds) def _map_values(self, mapper, na_action=None): - """An internal function that maps values using the input + """ + An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters @@ -1002,8 +1154,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, See Also -------- - Series.count: number of non-NA elements in a Series - DataFrame.count: number of non-NA elements in a DataFrame + Series.count: Number of non-NA elements in a Series. + DataFrame.count: Number of non-NA elements in a DataFrame. Examples -------- @@ -1175,7 +1327,8 @@ def factorize(self, sort=False, na_sentinel=-1): return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) _shared_docs['searchsorted'] = ( - """Find indices where elements should be inserted to maintain order. + """ + Find indices where elements should be inserted to maintain order. Find the indices into a sorted %(klass)s `self` such that, if the corresponding elements in `value` were inserted before the indices, diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 530a3ecb5f378..43c35c4000bb6 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1,8 +1,9 @@ import warnings +from pandas.core.dtypes.dtypes import CategoricalDtype # noqa + +from pandas.core.arrays import Categorical # noqa + # TODO: Remove after 0.23.x warnings.warn("'pandas.core' is private. 
Use 'pandas.Categorical'", FutureWarning, stacklevel=2) - -from pandas.core.arrays import Categorical # noqa -from pandas.core.dtypes.dtypes import CategoricalDtype # noqa diff --git a/pandas/core/common.py b/pandas/core/common.py index 14e47936e1b50..b4de0daa13b16 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -4,23 +4,23 @@ Note: pandas.core.common is *not* part of the public API. """ +import collections from datetime import datetime, timedelta from functools import partial import inspect -import collections import numpy as np + from pandas._libs import lib, tslibs +import pandas.compat as compat +from pandas.compat import PY36, OrderedDict, iteritems -from pandas import compat -from pandas.compat import iteritems, PY36, OrderedDict -from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( - is_integer, is_bool_dtype, is_extension_array_dtype, is_array_like -) + is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer) +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa -from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike class SettingWithCopyError(ValueError): @@ -66,17 +66,6 @@ def consensus_name_attr(objs): return name -# TODO: only used once in frame.py; belongs elsewhere? -def get_info_slice(obj, indexer): - """Slice the info axis of `obj` with `indexer`.""" - if not hasattr(obj, '_info_axis_number'): - msg = 'object of type {typ!r} has no info axis' - raise TypeError(msg.format(typ=type(obj).__name__)) - slices = [slice(None)] * obj.ndim - slices[obj._info_axis_number] = indexer - return tuple(slices) - - def maybe_box(indexer, values, obj, key): # if we have multiples coming back, box em @@ -430,21 +419,6 @@ def random_state(state=None): "RandomState, or None") -# TODO: only used once in indexes.api; belongs elsewhere? -def get_distinct_objs(objs): - """ - Return a list with distinct elements of "objs" (different ids). - Preserves order. - """ - ids = set() - res = [] - for obj in objs: - if not id(obj) in ids: - ids.add(id(obj)) - res.append(obj) - return res - - def _pipe(obj, func, *args, **kwargs): """ Apply a function ``func`` to object ``obj`` either by passing obj as the @@ -478,3 +452,21 @@ def _pipe(obj, func, *args, **kwargs): return func(*args, **kwargs) else: return func(obj, *args, **kwargs) + + +def _get_rename_function(mapper): + """ + Returns a function that will map names/labels, dependent if mapper + is a dict, Series or just a function. 
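(Aside, not from the patch: a plain-Python sketch of the dict-versus-callable dispatch the helper above describes; the function and variable names here are illustrative only.)

    def get_rename_function(mapper):
        # dict-like mappers become a lookup that falls back to the input label
        if isinstance(mapper, dict):
            return lambda x: mapper.get(x, x)
        # callables are used as-is
        return mapper

    f = get_rename_function({"a": "alpha"})
    print(f("a"), f("b"))   # alpha b
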
+ """ + if isinstance(mapper, (compat.Mapping, ABCSeries)): + + def f(x): + if x in mapper: + return mapper[x] + else: + return x + else: + f = mapper + + return f diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 22c8b641cf974..951174648091f 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -1,15 +1,16 @@ """Core eval alignment algorithms """ -import warnings from functools import partial, wraps -from pandas.compat import zip, range +import warnings import numpy as np +from pandas.compat import range, zip +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import compat -from pandas.errors import PerformanceWarning import pandas.core.common as com from pandas.core.computation.common import _result_type_many @@ -29,9 +30,8 @@ def _align_core_single_unary_op(term): def _zip_axes_from_type(typ, new_axes): - axes = {} - for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES): - axes[ax_name] = new_axes[ax_ind] + axes = {ax_name: new_axes[ax_ind] + for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES)} return axes diff --git a/pandas/core/computation/api.py b/pandas/core/computation/api.py index a6fe5aae822df..31e8a4873b0ad 100644 --- a/pandas/core/computation/api.py +++ b/pandas/core/computation/api.py @@ -1,14 +1,3 @@ # flake8: noqa from pandas.core.computation.eval import eval - - -# deprecation, xref #13790 -def Expr(*args, **kwargs): - import warnings - - warnings.warn("pd.Expr is deprecated as it is not " - "applicable to user code", - FutureWarning, stacklevel=2) - from pandas.core.computation.expr import Expr - return Expr(*args, **kwargs) diff --git a/pandas/core/computation/check.py b/pandas/core/computation/check.py index 2a9ed0fb9764d..d2d5e018063ff 100644 --- a/pandas/core/computation/check.py +++ b/pandas/core/computation/check.py @@ -1,8 +1,8 @@ -import warnings from distutils.version import LooseVersion +import warnings _NUMEXPR_INSTALLED = False -_MIN_NUMEXPR_VERSION = "2.4.6" +_MIN_NUMEXPR_VERSION = "2.6.1" try: import numexpr as ne diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 105cc497a4207..e7eca04e413c5 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -1,7 +1,9 @@ import numpy as np -import pandas as pd + from pandas.compat import reduce +import pandas as pd + def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 155ff554cf99c..bccd37131c81a 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -4,14 +4,14 @@ import abc -from pandas import compat from pandas.compat import map -import pandas.io.formats.printing as printing + +from pandas import compat from pandas.core.computation.align import _align, _reconstruct_object from pandas.core.computation.ops import ( - UndefinedVariableError, - _mathops, _reductions) + UndefinedVariableError, _mathops, _reductions) +import pandas.io.formats.printing as printing _ne_builtins = frozenset(_mathops + _reductions) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 7025f3000eb5f..4b9ba02ed85a4 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -3,14 +3,17 @@ """Top level ``eval`` module. 
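(Aside, not from the patch: the computation modules touched in these hunks sit behind the public ``pd.eval`` entry point; a minimal usage sketch follows, relying only on documented behaviour.)

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    # names in the expression are resolved from the calling namespace
    result = pd.eval("df.a + df.b")
    print(result.tolist())   # [4, 6]
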
""" -import warnings import tokenize -from pandas.io.formats.printing import pprint_thing -from pandas.core.computation.scope import _ensure_scope +import warnings + from pandas.compat import string_types -from pandas.core.computation.engines import _engines from pandas.util._validators import validate_bool_kwarg +from pandas.core.computation.engines import _engines +from pandas.core.computation.scope import _ensure_scope + +from pandas.io.formats.printing import pprint_thing + def _check_engine(engine): """Make sure a valid engine is passed. diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index b68b6970a89cc..9a44198ba3b86 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -2,25 +2,25 @@ """ import ast +from functools import partial import tokenize -from functools import partial import numpy as np +from pandas.compat import StringIO, lmap, reduce, string_types, zip + import pandas as pd from pandas import compat -from pandas.compat import StringIO, lmap, zip, reduce, string_types -from pandas.core.base import StringMixin from pandas.core import common as com -import pandas.io.formats.printing as printing -from pandas.core.reshape.util import compose +from pandas.core.base import StringMixin from pandas.core.computation.ops import ( - _cmp_ops_syms, _bool_ops_syms, - _arith_ops_syms, _unary_ops_syms, is_term) -from pandas.core.computation.ops import _reductions, _mathops, _LOCAL_TAG -from pandas.core.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div -from pandas.core.computation.ops import UndefinedVariableError, FuncNode + _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp, + UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms, + _mathops, _reductions, _unary_ops_syms, is_term) from pandas.core.computation.scope import Scope +from pandas.core.reshape.util import compose + +import pandas.io.formats.printing as printing def tokenize_string(source): diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index c12056a3ee78c..d44fae624a91c 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -7,6 +7,7 @@ """ import warnings + import numpy as np import pandas.core.common as com diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index ca0c4db4947c4..9e9f124352229 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -1,21 +1,23 @@ """Operator classes for eval. 
""" -import operator as op -from functools import partial from datetime import datetime +from functools import partial +import operator as op import numpy as np +from pandas.compat import PY3, string_types, text_type + from pandas.core.dtypes.common import is_list_like, is_scalar + import pandas as pd -from pandas.compat import PY3, string_types, text_type -import pandas.core.common as com -from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded from pandas.core.base import StringMixin +import pandas.core.common as com from pandas.core.computation.common import _ensure_decoded, _result_type_many from pandas.core.computation.scope import _DEFAULT_GLOBALS +from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded _reductions = 'sum', 'prod' diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index e08df3e340138..db409b215a78d 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -2,20 +2,24 @@ import ast from functools import partial + import numpy as np -import pandas as pd + +from pandas.compat import DeepChainMap, string_types, u from pandas.core.dtypes.common import is_list_like -import pandas.core.common as com -from pandas.compat import u, string_types, DeepChainMap + +import pandas as pd from pandas.core.base import StringMixin -from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded +import pandas.core.common as com from pandas.core.computation import expr, ops -from pandas.core.computation.ops import is_term, UndefinedVariableError -from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.common import _ensure_decoded +from pandas.core.computation.expr import BaseExprVisitor +from pandas.core.computation.ops import UndefinedVariableError, is_term from pandas.core.tools.timedeltas import _coerce_scalar_to_timedelta_type +from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded + class Scope(expr.Scope): __slots__ = 'queryables', diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index c3128be0f5599..33c5a1c2e0f0a 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -2,18 +2,18 @@ Module for scope operations """ -import sys -import struct -import inspect import datetime +import inspect import itertools import pprint +import struct +import sys import numpy as np -import pandas +from pandas.compat import DeepChainMap, StringIO, map + import pandas as pd # noqa -from pandas.compat import DeepChainMap, map, StringIO from pandas.core.base import StringMixin import pandas.core.computation as compu @@ -48,7 +48,7 @@ def _raw_hex_id(obj): _DEFAULT_GLOBALS = { - 'Timestamp': pandas._libs.tslib.Timestamp, + 'Timestamp': pd._libs.tslib.Timestamp, 'datetime': datetime.datetime, 'True': True, 'False': False, diff --git a/pandas/core/config.py b/pandas/core/config.py index f178600b74626..0f43ca65d187a 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -48,13 +48,13 @@ """ -import re - from collections import namedtuple from contextlib import contextmanager +import re import warnings -from pandas.compat import map, lmap, u + import pandas.compat as compat +from pandas.compat import lmap, map, u DeprecatedOption = namedtuple('DeprecatedOption', 'key msg rkey removal_ver') RegisteredOption = namedtuple('RegisteredOption', @@ -385,7 +385,6 @@ class option_context(object): >>> with option_context('display.max_rows', 10, 'display.max_columns', 5): ... ... 
- """ def __init__(self, *args): @@ -396,11 +395,8 @@ def __init__(self, *args): self.ops = list(zip(args[::2], args[1::2])) def __enter__(self): - undo = [] - for pat, val in self.ops: - undo.append((pat, _get_option(pat, silent=True))) - - self.undo = undo + self.undo = [(pat, _get_option(pat, silent=True)) + for pat, val in self.ops] for pat, val in self.ops: _set_option(pat, val, silent=True) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index b836a35b8cf29..d42a1ab72b156 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -10,8 +10,10 @@ """ import pandas.core.config as cf -from pandas.core.config import (is_int, is_bool, is_text, is_instance_factory, - is_one_of_factory, is_callable) +from pandas.core.config import ( + is_bool, is_callable, is_instance_factory, is_int, is_one_of_factory, + is_text) + from pandas.io.formats.console import detect_console_encoding from pandas.io.formats.terminal import is_terminal diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index 738e1ea9062f6..76021705563bf 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -1,82 +1,14 @@ # flake8: noqa -import sys - -from .common import (pandas_dtype, - is_dtype_equal, - is_extension_type, - - # categorical - is_categorical, - is_categorical_dtype, - - # interval - is_interval, - is_interval_dtype, - - # datetimelike - is_datetimetz, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_datetime64_any_dtype, - is_datetime64_ns_dtype, - is_timedelta64_dtype, - is_timedelta64_ns_dtype, - is_period, - is_period_dtype, - - # string-like - is_string_dtype, - is_object_dtype, - - # sparse - is_sparse, - - # numeric types - is_scalar, - is_sparse, - is_bool, - is_integer, - is_float, - is_complex, - is_number, - is_integer_dtype, - is_int64_dtype, - is_numeric_dtype, - is_float_dtype, - is_bool_dtype, - is_complex_dtype, - is_signed_integer_dtype, - is_unsigned_integer_dtype, - - # like - is_re, - is_re_compilable, - is_dict_like, - is_iterator, - is_file_like, - is_array_like, - is_list_like, - is_hashable, - is_named_tuple) - - -# deprecated -m = sys.modules['pandas.core.dtypes.api'] - -for t in ['is_any_int_dtype', 'is_floating_dtype', 'is_sequence']: - - def outer(t=t): - - def wrapper(arr_or_dtype): - import warnings - import pandas - warnings.warn("{t} is deprecated and will be " - "removed in a future version".format(t=t), - FutureWarning, stacklevel=3) - return getattr(pandas.core.dtypes.common, t)(arr_or_dtype) - return wrapper - - setattr(m, t, outer(t)) - -del sys, m, t, outer +from .common import ( + is_array_like, is_bool, is_bool_dtype, is_categorical, + is_categorical_dtype, is_complex, is_complex_dtype, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, + is_datetime64tz_dtype, is_datetimetz, is_dict_like, is_dtype_equal, + is_extension_type, is_file_like, is_float, is_float_dtype, is_hashable, + is_int64_dtype, is_integer, is_integer_dtype, is_interval, + is_interval_dtype, is_iterator, is_list_like, is_named_tuple, is_number, + is_numeric_dtype, is_object_dtype, is_period, is_period_dtype, is_re, + is_re_compilable, is_scalar, is_signed_integer_dtype, is_sparse, + is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, + is_unsigned_integer_dtype, pandas_dtype) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index b0fa55e346613..aa81e88abf28e 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,10 +1,12 @@ """Extend pandas with custom array 
types""" import numpy as np -from pandas import compat -from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCDataFrame from pandas.errors import AbstractMethodError +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries + +from pandas import compat + class _DtypeOpsMixin(object): # Not all of pandas' extension dtypes are compatibile with @@ -22,14 +24,17 @@ class _DtypeOpsMixin(object): # of the NA value, not the physical NA vaalue for storage. # e.g. for JSONArray, this is an empty dictionary. na_value = np.nan + _metadata = () def __eq__(self, other): """Check whether 'other' is equal to self. - By default, 'other' is considered equal if + By default, 'other' is considered equal if either * it's a string matching 'self.name'. - * it's an instance of this type. + * it's an instance of this type and all of the + the attributes in ``self._metadata`` are equal between + `self` and `other`. Parameters ---------- @@ -40,11 +45,19 @@ def __eq__(self, other): bool """ if isinstance(other, compat.string_types): - return other == self.name - elif isinstance(other, type(self)): - return True - else: - return False + try: + other = self.construct_from_string(other) + except TypeError: + return False + if isinstance(other, type(self)): + return all( + getattr(self, attr) == getattr(other, attr) + for attr in self._metadata + ) + return False + + def __hash__(self): + return hash(tuple(getattr(self, attr) for attr in self._metadata)) def __ne__(self, other): return not self.__eq__(other) @@ -133,7 +146,8 @@ def _is_boolean(self): class ExtensionDtype(_DtypeOpsMixin): - """A custom data type, to be paired with an ExtensionArray. + """ + A custom data type, to be paired with an ExtensionArray. .. versionadded:: 0.23.0 @@ -161,6 +175,26 @@ class ExtensionDtype(_DtypeOpsMixin): The `na_value` class attribute can be used to set the default NA value for this type. :attr:`numpy.nan` is used by default. + ExtensionDtypes are required to be hashable. The base class provides + a default implementation, which relies on the ``_metadata`` class + attribute. ``_metadata`` should be a tuple containing the strings + that define your data type. For example, with ``PeriodDtype`` that's + the ``freq`` attribute. + + **If you have a parametrized dtype you should set the ``_metadata`` + class property**. + + Ideally, the attributes in ``_metadata`` will match the + parameters to your ``ExtensionDtype.__init__`` (if any). If any of + the attributes in ``_metadata`` don't implement the standard + ``__eq__`` or ``__hash__``, the default implementations here will not + work. + + .. versionchanged:: 0.24.0 + + Added ``_metadata``, ``__hash__``, and changed the default definition + of ``__eq__``. + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is @@ -178,7 +212,8 @@ def __str__(self): @property def type(self): # type: () -> type - """The scalar type for the array, e.g. ``int`` + """ + The scalar type for the array, e.g. 
``int`` It's expected ``ExtensionArray[item]`` returns an instance of ``ExtensionDtype.type`` for scalar ``item``, assuming @@ -190,7 +225,8 @@ def type(self): @property def kind(self): # type () -> str - """A character code (one of 'biufcmMOSUV'), default 'O' + """ + A character code (one of 'biufcmMOSUV'), default 'O' This should match the NumPy dtype used when the array is converted to an ndarray, which is probably 'O' for object if @@ -206,7 +242,8 @@ def kind(self): @property def name(self): # type: () -> str - """A string identifying the data type. + """ + A string identifying the data type. Will be used for display in, e.g. ``Series.dtype`` """ @@ -214,7 +251,8 @@ def name(self): @classmethod def construct_array_type(cls): - """Return the array type associated with this dtype + """ + Return the array type associated with this dtype Returns ------- @@ -224,7 +262,8 @@ def construct_array_type(cls): @classmethod def construct_from_string(cls, string): - """Attempt to construct this type from a string. + """ + Attempt to construct this type from a string. Parameters ---------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a95a45d5f9ae4..eae9eb97f35fe 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -3,38 +3,26 @@ from datetime import datetime, timedelta import numpy as np -import warnings - -from pandas._libs import tslib, lib, tslibs -from pandas._libs.tslibs import iNaT, OutOfBoundsDatetime -from pandas.compat import string_types, text_type, PY3 -from .common import (ensure_object, is_bool, is_integer, is_float, - is_complex, is_datetimetz, is_categorical_dtype, - is_datetimelike, - is_extension_type, - is_extension_array_dtype, - is_object_dtype, - is_datetime64tz_dtype, is_datetime64_dtype, - is_datetime64_ns_dtype, - is_timedelta64_dtype, is_timedelta64_ns_dtype, - is_dtype_equal, - is_float_dtype, is_complex_dtype, - is_integer_dtype, - is_unsigned_integer_dtype, - is_datetime_or_timedelta_dtype, - is_bool_dtype, is_scalar, - is_string_dtype, _string_dtypes, - pandas_dtype, - ensure_int8, ensure_int16, - ensure_int32, ensure_int64, - _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, - _POSSIBLY_CAST_DTYPES) -from .dtypes import (ExtensionDtype, PandasExtensionDtype, DatetimeTZDtype, - PeriodDtype) -from .generic import (ABCDatetimeIndex, ABCPeriodIndex, - ABCSeries) -from .missing import isna, notna + +from pandas._libs import lib, tslib, tslibs +from pandas._libs.tslibs import OutOfBoundsDatetime, Period, iNaT +from pandas.compat import PY3, string_types, text_type, to_str + +from .common import ( + _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, _TD_DTYPE, _string_dtypes, + ensure_int8, ensure_int16, ensure_int32, ensure_int64, ensure_object, + is_bool, is_bool_dtype, is_categorical_dtype, is_complex, is_complex_dtype, + is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, is_datetimelike, is_dtype_equal, + is_extension_array_dtype, is_extension_type, is_float, is_float_dtype, + is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype, + is_timedelta64_dtype, is_timedelta64_ns_dtype, is_unsigned_integer_dtype, + pandas_dtype) +from .dtypes import ( + DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype, PeriodDtype) +from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries from .inference import is_list_like +from .missing import isna, notna _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -164,6 +152,12 @@ def trans(x): # noqa result = 
to_datetime(result).tz_localize('utc') result = result.tz_convert(dtype.tz) + elif dtype.type == Period: + # TODO(DatetimeArray): merge with previous elif + from pandas.core.arrays import PeriodArray + + return PeriodArray(result, freq=dtype.freq) + except Exception: pass @@ -269,29 +263,11 @@ def maybe_promote(dtype, fill_value=np.nan): fill_value = np.nan # returns tuple of (dtype, fill_value) - if issubclass(dtype.type, (np.datetime64, np.timedelta64)): - # for now: refuse to upcast datetime64 - # (this is because datetime64 will not implicitly upconvert - # to object correctly as of numpy 1.6.1) - if isna(fill_value): - fill_value = iNaT - else: - if issubclass(dtype.type, np.datetime64): - try: - fill_value = tslibs.Timestamp(fill_value).value - except Exception: - # the proper thing to do here would probably be to upcast - # to object (but numpy 1.6.1 doesn't do this properly) - fill_value = iNaT - elif issubclass(dtype.type, np.timedelta64): - try: - fill_value = tslibs.Timedelta(fill_value).value - except Exception: - # as for datetimes, cannot upcast to object - fill_value = iNaT - else: - fill_value = iNaT - elif is_datetimetz(dtype): + if issubclass(dtype.type, np.datetime64): + fill_value = tslibs.Timestamp(fill_value).value + elif issubclass(dtype.type, np.timedelta64): + fill_value = tslibs.Timedelta(fill_value).value + elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = iNaT elif is_extension_array_dtype(dtype) and isna(fill_value): @@ -334,7 +310,7 @@ def maybe_promote(dtype, fill_value=np.nan): # in case we have a string that looked like a number if is_extension_array_dtype(dtype): pass - elif is_datetimetz(dtype): + elif is_datetime64tz_dtype(dtype): pass elif issubclass(np.dtype(dtype).type, string_types): dtype = np.object_ @@ -570,34 +546,6 @@ def invalidate_string_dtypes(dtype_set): raise TypeError("string dtypes are not allowed, use 'object' instead") -def maybe_convert_string_to_object(values): - """ - - Convert string-like and string-like array to convert object dtype. - This is to avoid numpy to handle the array as str dtype. - """ - if isinstance(values, string_types): - values = np.array([values], dtype=object) - elif (isinstance(values, np.ndarray) and - issubclass(values.dtype.type, (np.string_, np.unicode_))): - values = values.astype(object) - return values - - -def maybe_convert_scalar(values): - """ - Convert a python scalar to the appropriate numpy dtype if possible - This avoids numpy directly converting according to platform preferences - """ - if is_scalar(values): - dtype, values = infer_dtype_from_scalar(values) - try: - values = dtype(values) - except TypeError: - pass - return values - - def coerce_indexer_dtype(indexer, categories): """ coerce the indexer input array to the smallest dtype possible """ length = len(categories) @@ -645,9 +593,9 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy=True): - """ return a view if copy is False, but - need to be very careful as the result shape could change! +def astype_nansafe(arr, dtype, copy=True, skipna=False): + """ + Cast the elements of an array to a given dtype a nan-safe manner. Parameters ---------- @@ -655,7 +603,14 @@ def astype_nansafe(arr, dtype, copy=True): dtype : np.dtype copy : bool, default True If False, a view will be attempted but may fail, if - e.g. the itemsizes don't align. + e.g. the item sizes don't align. 
+ skipna: bool, default False + Whether or not we should skip NaN when casting as a string-type. + + Raises + ------ + ValueError + The dtype was a datetime64/timedelta64 dtype, but it had no unit. """ # dispatch on extension dtype if needed @@ -668,10 +623,12 @@ def astype_nansafe(arr, dtype, copy=True): if issubclass(dtype.type, text_type): # in Py3 that's str, in Py2 that's unicode - return lib.astype_unicode(arr.ravel()).reshape(arr.shape) + return lib.astype_unicode(arr.ravel(), + skipna=skipna).reshape(arr.shape) elif issubclass(dtype.type, string_types): - return lib.astype_str(arr.ravel()).reshape(arr.shape) + return lib.astype_str(arr.ravel(), + skipna=skipna).reshape(arr.shape) elif is_datetime64_dtype(arr): if is_object_dtype(dtype): @@ -735,12 +692,9 @@ def astype_nansafe(arr, dtype, copy=True): return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy) if dtype.name in ("datetime64", "timedelta64"): - msg = ("Passing in '{dtype}' dtype with no frequency is " - "deprecated and will raise in a future version. " + msg = ("The '{dtype}' dtype has no unit. " "Please pass in '{dtype}[ns]' instead.") - warnings.warn(msg.format(dtype=dtype.name), - FutureWarning, stacklevel=5) - dtype = np.dtype(dtype.name + "[ns]") + raise ValueError(msg.format(dtype=dtype.name)) if copy or is_object_dtype(arr) or is_object_dtype(dtype): # Explicit copy, or required since NumPy can't view from / to object. @@ -1009,16 +963,14 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): if is_datetime64 or is_datetime64tz or is_timedelta64: - # force the dtype if needed - msg = ("Passing in '{dtype}' dtype with no frequency is " - "deprecated and will raise in a future version. " + # Force the dtype if needed. + msg = ("The '{dtype}' dtype has no unit. " "Please pass in '{dtype}[ns]' instead.") if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): if dtype.name in ('datetime64', 'datetime64[ns]'): if dtype.name == 'datetime64': - warnings.warn(msg.format(dtype=dtype.name), - FutureWarning, stacklevel=5) + raise ValueError(msg.format(dtype=dtype.name)) dtype = _NS_DTYPE else: raise TypeError("cannot convert datetimelike to " @@ -1034,8 +986,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): if dtype.name in ('timedelta64', 'timedelta64[ns]'): if dtype.name == 'timedelta64': - warnings.warn(msg.format(dtype=dtype.name), - FutureWarning, stacklevel=5) + raise ValueError(msg.format(dtype=dtype.name)) dtype = _TD_DTYPE else: raise TypeError("cannot convert timedeltalike to " @@ -1209,7 +1160,7 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): np.ndarray / pandas type of length, filled with value """ - if is_datetimetz(dtype): + if is_datetime64tz_dtype(dtype): from pandas import DatetimeIndex subarr = DatetimeIndex([value] * length, dtype=dtype) elif is_categorical_dtype(dtype): @@ -1219,11 +1170,16 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = dtype.dtype - # coerce if we have nan for an integer dtype - # GH 22858: only cast to float if an index - # (passed here as length) is specified if length and is_integer_dtype(dtype) and isna(value): - dtype = np.float64 + # coerce if we have nan for an integer dtype + dtype = np.dtype('float64') + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): + # we need to coerce to object dtype to avoid + # to allow numpy to take our string as a scalar value + dtype = object + if not 
isna(value): + value = to_str(value) + subarr = np.empty(length, dtype=dtype) subarr.fill(value) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a9fc9d13d4ab3..e1141c6b6b3a8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1,27 +1,26 @@ """ common type operations """ +import warnings import numpy as np -from pandas.compat import (string_types, text_type, binary_type, - PY3, PY36) + from pandas._libs import algos, lib -from pandas._libs.tslibs import conversion, Period, Timestamp from pandas._libs.interval import Interval +from pandas._libs.tslibs import Period, Timestamp, conversion +from pandas.compat import PY3, PY36, binary_type, string_types, text_type from pandas.core.dtypes.dtypes import ( - registry, CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, - PeriodDtype, IntervalDtype, - PandasExtensionDtype, ExtensionDtype, - _pandas_registry) + CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, ExtensionDtype, + IntervalDtype, PandasExtensionDtype, PeriodDtype, _pandas_registry, + registry) from pandas.core.dtypes.generic import ( - ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, - ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass, - ABCDateOffset) + ABCCategorical, ABCCategoricalIndex, ABCDateOffset, ABCDatetimeIndex, + ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, ABCSeries, ABCSparseArray, + ABCSparseSeries) from pandas.core.dtypes.inference import ( # noqa:F401 - is_bool, is_integer, is_hashable, is_iterator, is_float, - is_dict_like, is_scalar, is_string_like, is_list_like, is_number, - is_file_like, is_re, is_re_compilable, is_sequence, is_nested_list_like, - is_named_tuple, is_array_like, is_decimal, is_complex, is_interval) - + is_array_like, is_bool, is_complex, is_decimal, is_dict_like, is_file_like, + is_float, is_hashable, is_integer, is_interval, is_iterator, is_list_like, + is_named_tuple, is_nested_list_like, is_number, is_re, is_re_compilable, + is_scalar, is_sequence, is_string_like) _POSSIBLY_CAST_DTYPES = {np.dtype(t).name for t in ['O', 'int8', 'uint8', 'int16', 'uint16', @@ -153,35 +152,64 @@ def is_object_dtype(arr_or_dtype): def is_sparse(arr): """ - Check whether an array-like is a pandas sparse array. + Check whether an array-like is a 1-D pandas sparse array. + + Check that the one-dimensional array-like is a pandas sparse array. + Returns True if it is a pandas sparse array, not another type of + sparse array. Parameters ---------- arr : array-like - The array-like to check. + Array-like to check. Returns ------- - boolean : Whether or not the array-like is a pandas sparse array. + bool + Whether or not the array-like is a pandas sparse array. + + See Also + -------- + DataFrame.to_sparse : Convert DataFrame to a SparseDataFrame. + Series.to_sparse : Convert Series to SparseSeries. + Series.to_dense : Return dense representation of a Series. Examples -------- - >>> is_sparse(np.array([1, 2, 3])) - False - >>> is_sparse(pd.SparseArray([1, 2, 3])) + Returns `True` if the parameter is a 1-D pandas sparse array. + + >>> is_sparse(pd.SparseArray([0, 0, 1, 0])) True - >>> is_sparse(pd.SparseSeries([1, 2, 3])) + >>> is_sparse(pd.SparseSeries([0, 0, 1, 0])) True - This function checks only for pandas sparse array instances, so - sparse arrays from other libraries will return False. + Returns `False` if the parameter is not sparse. 
+ + >>> is_sparse(np.array([0, 0, 1, 0])) + False + >>> is_sparse(pd.Series([0, 1, 0, 0])) + False + + Returns `False` if the parameter is not a pandas sparse array. >>> from scipy.sparse import bsr_matrix - >>> is_sparse(bsr_matrix([1, 2, 3])) + >>> is_sparse(bsr_matrix([0, 1, 0, 0])) + False + + Returns `False` if the parameter has more than one dimension. + + >>> df = pd.SparseDataFrame([389., 24., 80.5, np.nan], + columns=['max_speed'], + index=['falcon', 'parrot', 'lion', 'monkey']) + >>> is_sparse(df) False + >>> is_sparse(df.max_speed) + True """ + from pandas.core.arrays.sparse import SparseDtype - return isinstance(arr, (ABCSparseArray, ABCSparseSeries)) + dtype = getattr(arr, 'dtype', arr) + return isinstance(dtype, SparseDtype) def is_scipy_sparse(arr): @@ -261,6 +289,8 @@ def is_datetimetz(arr): Check whether an array-like is a datetime array-like with a timezone component in its dtype. + .. deprecated:: 0.24.0 + Parameters ---------- arr : array-like @@ -294,12 +324,10 @@ def is_datetimetz(arr): True """ - # TODO: do we need this function? - # It seems like a repeat of is_datetime64tz_dtype. - - return ((isinstance(arr, ABCDatetimeIndex) and - getattr(arr, 'tz', None) is not None) or - is_datetime64tz_dtype(arr)) + warnings.warn("'is_datetimetz' is deprecated and will be removed in a " + "future version. Use 'is_datetime64tz_dtype' instead.", + FutureWarning, stacklevel=2) + return is_datetime64tz_dtype(arr) def is_offsetlike(arr_or_obj): @@ -337,6 +365,8 @@ def is_period(arr): """ Check whether an array-like is a periodical index. + .. deprecated:: 0.24.0 + Parameters ---------- arr : array-like @@ -356,8 +386,10 @@ def is_period(arr): True """ - # TODO: do we need this function? - # It seems like a repeat of is_period_arraylike. + warnings.warn("'is_period' is deprecated and will be removed in a future " + "version. Use 'is_period_dtype' or is_period_arraylike' " + "instead.", FutureWarning, stacklevel=2) + return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr) @@ -393,7 +425,7 @@ def is_datetime64_dtype(arr_or_dtype): return False try: tipo = _get_dtype_type(arr_or_dtype) - except TypeError: + except (TypeError, UnicodeEncodeError): return False return issubclass(tipo, np.datetime64) @@ -638,10 +670,10 @@ def is_period_arraylike(arr): True """ - if isinstance(arr, ABCPeriodIndex): + if isinstance(arr, (ABCPeriodIndex, ABCPeriodArray)): return True elif isinstance(arr, (np.ndarray, ABCSeries)): - return arr.dtype == object and lib.infer_dtype(arr) == 'period' + return is_period_dtype(arr.dtype) return getattr(arr, 'inferred_type', None) == 'period' @@ -717,8 +749,7 @@ def is_datetimelike(arr): return (is_datetime64_dtype(arr) or is_datetime64tz_dtype(arr) or is_timedelta64_dtype(arr) or - isinstance(arr, ABCPeriodIndex) or - is_datetimetz(arr)) + isinstance(arr, ABCPeriodIndex)) def is_dtype_equal(source, target): @@ -796,11 +827,11 @@ def is_dtype_union_equal(source, target): def is_any_int_dtype(arr_or_dtype): """Check whether the provided array or dtype is of an integer dtype. - .. deprecated:: 0.20.0 - In this function, timedelta64 instances are also considered "any-integer" type objects and will return True. + This function is internal and should not be exposed in the public API. 
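A small usage sketch (not part of the patch) of the is_datetimetz deprecation above, assuming pandas.api.types re-exports these helpers as in the 0.24 line:

    import warnings
    import pandas as pd
    from pandas.api.types import is_datetimetz, is_datetime64tz_dtype

    ser = pd.Series(pd.date_range("2000", periods=3, tz="UTC"))

    # The old helper still answers the question but now warns ...
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        print(is_datetimetz(ser))            # True
        print(caught[-1].category.__name__)  # FutureWarning

    # ... and the dtype-based check is the suggested replacement.
    print(is_datetime64tz_dtype(ser))        # True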
+ Parameters ---------- arr_or_dtype : array-like @@ -1024,54 +1055,6 @@ def is_int64_dtype(arr_or_dtype): return issubclass(tipo, np.int64) -def is_int_or_datetime_dtype(arr_or_dtype): - """ - Check whether the provided array or dtype is of an - integer, timedelta64, or datetime64 dtype. - - Parameters - ---------- - arr_or_dtype : array-like - The array or dtype to check. - - Returns - ------- - boolean : Whether or not the array or dtype is of an - integer, timedelta64, or datetime64 dtype. - - Examples - -------- - >>> is_int_or_datetime_dtype(str) - False - >>> is_int_or_datetime_dtype(int) - True - >>> is_int_or_datetime_dtype(float) - False - >>> is_int_or_datetime_dtype(np.uint64) - True - >>> is_int_or_datetime_dtype(np.datetime64) - True - >>> is_int_or_datetime_dtype(np.timedelta64) - True - >>> is_int_or_datetime_dtype(np.array(['a', 'b'])) - False - >>> is_int_or_datetime_dtype(pd.Series([1, 2])) - True - >>> is_int_or_datetime_dtype(np.array([], dtype=np.timedelta64)) - True - >>> is_int_or_datetime_dtype(np.array([], dtype=np.datetime64)) - True - >>> is_int_or_datetime_dtype(pd.Index([1, 2.])) # float - False - """ - - if arr_or_dtype is None: - return False - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, np.integer) or - issubclass(tipo, (np.datetime64, np.timedelta64))) - - def is_datetime64_any_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of the datetime64 dtype. @@ -1560,6 +1543,8 @@ def is_float_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of a float dtype. + This function is internal and should not be exposed in the public API. + Parameters ---------- arr_or_dtype : array-like @@ -1591,22 +1576,6 @@ def is_float_dtype(arr_or_dtype): return issubclass(tipo, np.floating) -def is_floating_dtype(arr_or_dtype): - """Check whether the provided array or dtype is an instance of - numpy's float dtype. - - .. deprecated:: 0.20.0 - - Unlike, `is_float_dtype`, this check is a lot stricter, as it requires - `isinstance` of `np.floating` and not `issubclass`. - """ - - if arr_or_dtype is None: - return False - tipo = _get_dtype_type(arr_or_dtype) - return isinstance(tipo, np.floating) - - def is_bool_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of a boolean dtype. @@ -1643,8 +1612,9 @@ def is_bool_dtype(arr_or_dtype): True >>> is_bool_dtype(pd.Categorical([True, False])) True + >>> is_bool_dtype(pd.SparseArray([True, False])) + True """ - if arr_or_dtype is None: return False try: @@ -1729,7 +1699,7 @@ def is_extension_type(arr): return True elif is_sparse(arr): return True - elif is_datetimetz(arr): + elif is_datetime64tz_dtype(arr): return True return False @@ -1751,6 +1721,8 @@ def is_extension_array_dtype(arr_or_dtype): array interface. In pandas, this includes: * Categorical + * Sparse + * Interval Third-party libraries may implement arrays or types satisfying this interface as well. @@ -1795,38 +1767,6 @@ def is_complex_dtype(arr_or_dtype): return issubclass(tipo, np.complexfloating) -def _coerce_to_dtype(dtype): - """ - Coerce a string or np.dtype to a pandas or numpy - dtype if possible. - - If we cannot convert to a pandas dtype initially, - we convert to a numpy dtype. - - Parameters - ---------- - dtype : The dtype that we want to coerce. - - Returns - ------- - pd_or_np_dtype : The coerced dtype. 
- """ - - if is_categorical_dtype(dtype): - categories = getattr(dtype, 'categories', None) - ordered = getattr(dtype, 'ordered', False) - dtype = CategoricalDtype(categories=categories, ordered=ordered) - elif is_datetime64tz_dtype(dtype): - dtype = DatetimeTZDtype(dtype) - elif is_period_dtype(dtype): - dtype = PeriodDtype(dtype) - elif is_interval_dtype(dtype): - dtype = IntervalDtype(dtype) - else: - dtype = np.dtype(dtype) - return dtype - - def _get_dtype(arr_or_dtype): """ Get the dtype instance associated with an array @@ -1873,7 +1813,8 @@ def _get_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) elif is_interval_dtype(arr_or_dtype): return IntervalDtype.construct_from_string(arr_or_dtype) - elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)): + elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex, + ABCSparseArray, ABCSparseSeries)): return arr_or_dtype.dtype if hasattr(arr_or_dtype, 'dtype'): @@ -1921,6 +1862,13 @@ def _get_dtype_type(arr_or_dtype): elif is_interval_dtype(arr_or_dtype): return Interval return _get_dtype_type(np.dtype(arr_or_dtype)) + else: + from pandas.core.arrays.sparse import SparseDtype + if isinstance(arr_or_dtype, (ABCSparseSeries, + ABCSparseArray, + SparseDtype)): + dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype) + return dtype.type try: return arr_or_dtype.dtype.type except AttributeError: @@ -1952,7 +1900,7 @@ def _get_dtype_from_object(dtype): return dtype elif is_categorical(dtype): return CategoricalDtype().type - elif is_datetimetz(dtype): + elif is_datetime64tz_dtype(dtype): return DatetimeTZDtype(dtype).type elif isinstance(dtype, np.dtype): # dtype object try: @@ -2028,7 +1976,6 @@ def pandas_dtype(dtype): Raises ------ TypeError if not a dtype - """ # short-circuit if isinstance(dtype, np.ndarray): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index c1aab961dcc9f..0df0c01dbd47a 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -3,25 +3,19 @@ """ import numpy as np + from pandas._libs import tslib, tslibs -from pandas import compat + from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_sparse, - is_extension_array_dtype, - is_datetimetz, - is_datetime64_dtype, - is_timedelta64_dtype, - is_period_dtype, - is_object_dtype, - is_bool_dtype, - is_interval_dtype, - is_dtype_equal, - _NS_DTYPE, - _TD_DTYPE) + _NS_DTYPE, _TD_DTYPE, is_bool_dtype, is_categorical_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, + is_extension_array_dtype, is_interval_dtype, is_object_dtype, + is_period_dtype, is_sparse, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCTimedeltaIndex, - ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame) + ABCDatetimeIndex, ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame, + ABCTimedeltaIndex) + +from pandas import compat def get_dtype_kinds(l): @@ -45,7 +39,7 @@ def get_dtype_kinds(l): typ = 'sparse' elif isinstance(arr, ABCRangeIndex): typ = 'range' - elif is_datetimetz(arr): + elif is_datetime64tz_dtype(arr): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) @@ -93,11 +87,13 @@ def _get_series_result_type(result, objs=None): def _get_frame_result_type(result, objs): """ return appropriate class of DataFrame-like concat - if all blocks are SparseBlock, return SparseDataFrame + if all blocks are sparse, return SparseDataFrame otherwise, return 1st obj """ - if result.blocks and all(b.is_sparse for b in 
result.blocks): + if (result.blocks and ( + all(is_sparse(b) for b in result.blocks) or + all(isinstance(obj, ABCSparseDataFrame) for obj in objs))): from pandas.core.sparse.api import SparseDataFrame return SparseDataFrame else: @@ -105,27 +101,6 @@ def _get_frame_result_type(result, objs): ABCSparseDataFrame)) -def _get_sliced_frame_result_type(data, obj): - """ - return appropriate class of Series. When data is sparse - it will return a SparseSeries, otherwise it will return - the Series. - - Parameters - ---------- - data : array-like - obj : DataFrame - - Returns - ------- - Series or SparseSeries - """ - if is_sparse(data): - from pandas.core.sparse.api import SparseSeries - return SparseSeries - return obj._constructor_sliced - - def _concat_compat(to_concat, axis=0): """ provide concatenation of an array of arrays each of which is a single @@ -216,15 +191,6 @@ def _concat_categorical(to_concat, axis=0): A single array, preserving the combined dtypes """ - def _concat_asobject(to_concat): - to_concat = [x.get_values() if is_categorical_dtype(x.dtype) - else np.asarray(x).ravel() for x in to_concat] - res = _concat_compat(to_concat) - if axis == 1: - return res.reshape(1, len(res)) - else: - return res - # we could have object blocks and categoricals here # if we only have a single categoricals then combine everything # else its a non-compat categorical @@ -239,7 +205,14 @@ def _concat_asobject(to_concat): if all(first.is_dtype_equal(other) for other in to_concat[1:]): return union_categoricals(categoricals) - return _concat_asobject(to_concat) + # extract the categoricals & coerce to object if needed + to_concat = [x.get_values() if is_categorical_dtype(x.dtype) + else np.asarray(x).ravel() if not is_datetime64tz_dtype(x) + else np.asarray(x.astype(object)) for x in to_concat] + result = _concat_compat(to_concat) + if axis == 1: + result = result.reshape(1, len(result)) + return result def union_categoricals(to_union, sort_categories=False, ignore_order=False): @@ -256,7 +229,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. - ignore_order: boolean, default False + ignore_order : boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. 
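As a usage sketch for the union_categoricals signature documented above (not part of the patch):

    import pandas as pd
    from pandas.api.types import union_categoricals

    a = pd.Categorical(["b", "c"], ordered=True)
    b = pd.Categorical(["a", "b"])

    # Mixing ordered and unordered inputs normally raises; ignore_order drops
    # the ordered attribute so the union succeeds as an unordered categorical.
    result = union_categoricals([a, b], ignore_order=True)
    print(list(result))             # ['b', 'c', 'a', 'b']
    print(list(result.categories))  # ['b', 'c', 'a']
    print(result.ordered)           # False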
@@ -398,10 +371,8 @@ def _maybe_unwrap(x): if sort_categories: categories = categories.sort_values() - new_codes = [] - for c in to_union: - new_codes.append(_recode_for_categories(c.codes, c.categories, - categories)) + new_codes = [_recode_for_categories(c.codes, c.categories, categories) + for c in to_union] new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message @@ -468,10 +439,10 @@ def _concat_datetime(to_concat, axis=0, typs=None): axis=axis).view(_TD_DTYPE) elif any(typ.startswith('period') for typ in typs): - # PeriodIndex must be handled by PeriodIndex, - # Thus can't meet this condition ATM - # Must be changed when we adding PeriodDtype - raise NotImplementedError("unable to concat PeriodDtype") + assert len(typs) == 1 + cls = to_concat[0] + new_values = cls._concat_same_type(to_concat) + return new_values def _convert_datetimelike_to_object(x): @@ -501,13 +472,7 @@ def _concat_datetimetz(to_concat, name=None): all inputs must be DatetimeIndex it is used in DatetimeIndex.append also """ - # do not pass tz to set because tzlocal cannot be hashed - if len({str(x.dtype) for x in to_concat}) != 1: - raise ValueError('to_concat must have the same tz') - tz = to_concat[0].tz - # no need to localize because internal repr will not be changed - new_values = np.concatenate([x.asi8 for x in to_concat]) - return to_concat[0]._simple_new(new_values, tz=tz, name=name) + return to_concat[0]._concat_same_dtype(to_concat, name=name) def _concat_index_same_dtype(indexes, klass=None): @@ -554,61 +519,18 @@ def _concat_sparse(to_concat, axis=0, typs=None): a single array, preserving the combined dtypes """ - from pandas.core.sparse.array import SparseArray, _make_index + from pandas.core.arrays import SparseArray - def convert_sparse(x, axis): - # coerce to native type - if isinstance(x, SparseArray): - x = x.get_values() - else: - x = np.asarray(x) - x = x.ravel() - if axis > 0: - x = np.atleast_2d(x) - return x + fill_values = [x.fill_value for x in to_concat + if isinstance(x, SparseArray)] + fill_value = fill_values[0] - if typs is None: - typs = get_dtype_kinds(to_concat) + # TODO: Fix join unit generation so we aren't passed this. 
+ to_concat = [x if isinstance(x, SparseArray) + else SparseArray(x.squeeze(), fill_value=fill_value) + for x in to_concat] - if len(typs) == 1: - # concat input as it is if all inputs are sparse - # and have the same fill_value - fill_values = {c.fill_value for c in to_concat} - if len(fill_values) == 1: - sp_values = [c.sp_values for c in to_concat] - indexes = [c.sp_index.to_int_index() for c in to_concat] - - indices = [] - loc = 0 - for idx in indexes: - indices.append(idx.indices + loc) - loc += idx.length - sp_values = np.concatenate(sp_values) - indices = np.concatenate(indices) - sp_index = _make_index(loc, indices, kind=to_concat[0].sp_index) - - return SparseArray(sp_values, sparse_index=sp_index, - fill_value=to_concat[0].fill_value) - - # input may be sparse / dense mixed and may have different fill_value - # input must contain sparse at least 1 - sparses = [c for c in to_concat if is_sparse(c)] - fill_values = [c.fill_value for c in sparses] - sp_indexes = [c.sp_index for c in sparses] - - # densify and regular concat - to_concat = [convert_sparse(x, axis) for x in to_concat] - result = np.concatenate(to_concat, axis=axis) - - if not len(typs - {'sparse', 'f', 'i'}): - # sparsify if inputs are sparse and dense numerics - # first sparse input's fill_value and SparseIndex is used - result = SparseArray(result.ravel(), fill_value=fill_values[0], - kind=sp_indexes[0]) - else: - # coerce to object if needed - result = result.astype('object') - return result + return SparseArray._concat_same_type(to_concat) def _concat_rangeindex_same_dtype(indexes): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index beda9bc02f4d5..82f931c1469b7 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1,11 +1,16 @@ """ define extension dtypes """ - import re +import warnings + import numpy as np -from pandas import compat -from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex -from pandas._libs.tslibs import Period, NaT, Timestamp +import pytz + from pandas._libs.interval import Interval +from pandas._libs.tslibs import NaT, Period, Timestamp, timezones + +from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCIndexClass + +from pandas import compat from .base import ExtensionDtype, _DtypeOpsMixin @@ -101,7 +106,6 @@ class PandasExtensionDtype(_DtypeOpsMixin): base = None isbuiltin = 0 isnative = 0 - _metadata = [] _cache = {} def __unicode__(self): @@ -209,7 +213,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): kind = 'O' str = '|O08' base = np.dtype('O') - _metadata = ['categories', 'ordered'] + _metadata = ('categories', 'ordered') _cache = {} def __init__(self, categories=None, ordered=None): @@ -335,16 +339,12 @@ def _hash_categories(categories, ordered=True): cat_array = [cat_array] hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) - if len(hashed) == 0: - # bug in Numpy<1.12 for length 0 arrays. 
Just return the correct - # value of 0 - return 0 - else: - return np.bitwise_xor.reduce(hashed) + return np.bitwise_xor.reduce(hashed) @classmethod def construct_array_type(cls): - """Return the array type associated with this dtype + """ + Return the array type associated with this dtype Returns ------- @@ -355,7 +355,8 @@ def construct_array_type(cls): @classmethod def construct_from_string(cls, string): - """ attempt to construct this type from a string, raise a TypeError if + """ + attempt to construct this type from a string, raise a TypeError if it's not possible """ try: if string == 'category': @@ -461,7 +462,9 @@ def categories(self): @property def ordered(self): - """Whether the categories have an ordered relationship""" + """ + Whether the categories have an ordered relationship + """ return self._ordered @property @@ -485,88 +488,125 @@ class DatetimeTZDtype(PandasExtensionDtype): str = '|M8[ns]' num = 101 base = np.dtype('M8[ns]') - _metadata = ['unit', 'tz'] + _metadata = ('unit', 'tz') _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") _cache = {} - def __new__(cls, unit=None, tz=None): - """ Create a new unit if needed, otherwise return from the cache + def __init__(self, unit="ns", tz=None): + """ + An ExtensionDtype for timezone-aware datetime data. Parameters ---------- - unit : string unit that this represents, currently must be 'ns' - tz : string tz that this represents - """ + unit : str, default "ns" + The precision of the datetime data. Currently limited + to ``"ns"``. + tz : str, int, or datetime.tzinfo + The timezone. + Raises + ------ + pytz.UnknownTimeZoneError + When the requested timezone cannot be found. + + Examples + -------- + >>> pd.core.dtypes.dtypes.DatetimeTZDtype(tz='UTC') + datetime64[ns, UTC] + + >>> pd.core.dtypes.dtypes.DatetimeTZDtype(tz='dateutil/US/Central') + datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')] + """ if isinstance(unit, DatetimeTZDtype): unit, tz = unit.unit, unit.tz - elif unit is None: - # we are called as an empty constructor - # generally for pickle compat - return object.__new__(cls) + if unit != 'ns': + if isinstance(unit, compat.string_types) and tz is None: + # maybe a string like datetime64[ns, tz], which we support for + # now. + result = type(self).construct_from_string(unit) + unit = result.unit + tz = result.tz + msg = ( + "Passing a dtype alias like 'datetime64[ns, {tz}]' " + "to DatetimeTZDtype is deprecated. Use " + "'DatetimeTZDtype.construct_from_string()' instead." 
+ ) + warnings.warn(msg.format(tz=tz), FutureWarning, stacklevel=2) + else: + raise ValueError("DatetimeTZDtype only supports ns units") + if tz: + tz = timezones.maybe_get_tz(tz) + elif tz is not None: + raise pytz.UnknownTimeZoneError(tz) elif tz is None: + raise TypeError("A 'tz' is required.") - # we were passed a string that we can construct - try: - m = cls._match.search(unit) - if m is not None: - unit = m.groupdict()['unit'] - tz = m.groupdict()['tz'] - except TypeError: - raise ValueError("could not construct DatetimeTZDtype") - - elif isinstance(unit, compat.string_types): + self._unit = unit + self._tz = tz - if unit != 'ns': - raise ValueError("DatetimeTZDtype only supports ns units") - - unit = unit - tz = tz + @property + def unit(self): + """The precision of the datetime data.""" + return self._unit - if tz is None: - raise ValueError("DatetimeTZDtype constructor must have a tz " - "supplied") + @property + def tz(self): + """The timezone.""" + return self._tz - # hash with the actual tz if we can - # some cannot be hashed, so stringfy - try: - key = (unit, tz) - hash(key) - except TypeError: - key = (unit, str(tz)) + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype - # set/retrieve from cache - try: - return cls._cache[key] - except KeyError: - u = object.__new__(cls) - u.unit = unit - u.tz = tz - cls._cache[key] = u - return u + Returns + ------- + type + """ + from pandas import DatetimeIndex + return DatetimeIndex @classmethod def construct_from_string(cls, string): - """ attempt to construct this type from a string, raise a TypeError if - it's not possible """ + Construct a DatetimeTZDtype from a string. + + Parameters + ---------- + string : str + The string alias for this DatetimeTZDtype. + Should be formatted like ``datetime64[ns, ]``, + where ```` is the timezone name. + + Examples + -------- + >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]') + datetime64[ns, UTC] + """ + msg = "Could not construct DatetimeTZDtype from '{}'" try: - return cls(unit=string) - except ValueError: - raise TypeError("could not construct DatetimeTZDtype") + match = cls._match.match(string) + if match: + d = match.groupdict() + return cls(unit=d['unit'], tz=d['tz']) + except Exception: + # TODO(py3): Change this pass to `raise TypeError(msg) from e` + pass + raise TypeError(msg.format(string)) def __unicode__(self): - # format the tz return "datetime64[{unit}, {tz}]".format(unit=self.unit, tz=self.tz) @property def name(self): + """A string representation of the dtype.""" return str(self) def __hash__(self): # make myself hashable + # TODO: update this. return hash(str(self)) def __eq__(self, other): @@ -577,8 +617,13 @@ def __eq__(self, other): self.unit == other.unit and str(self.tz) == str(other.tz)) + def __setstate__(self, state): + # for pickle compat. + self._tz = state['tz'] + self._unit = state['unit'] + -class PeriodDtype(PandasExtensionDtype): +class PeriodDtype(ExtensionDtype, PandasExtensionDtype): """ A Period duck-typed class, suitable for holding a period with freq dtype. 
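A short sketch (not part of the patch) of the reworked DatetimeTZDtype constructor above; the import path mirrors the doctests in this hunk:

    import pandas as pd
    from pandas.core.dtypes.dtypes import DatetimeTZDtype

    # unit defaults to "ns"; tz is now required and validated.
    dtype = DatetimeTZDtype(tz="UTC")
    print(dtype)                   # datetime64[ns, UTC]

    # String aliases go through construct_from_string() instead of __new__.
    alias = DatetimeTZDtype.construct_from_string("datetime64[ns, UTC]")
    print(dtype == alias)          # True

    # Unresolvable timezones surface as pytz.UnknownTimeZoneError.
    try:
        DatetimeTZDtype(tz="Not/A_Zone")
    except Exception as exc:
        print(type(exc).__name__)  # UnknownTimeZoneError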
@@ -589,7 +634,7 @@ class PeriodDtype(PandasExtensionDtype): str = '|O08' base = np.dtype('O') num = 102 - _metadata = ['freq'] + _metadata = ('freq',) _match = re.compile(r"(P|p)eriod\[(?P.+)\]") _cache = {} @@ -696,6 +741,12 @@ def is_dtype(cls, dtype): return False return super(PeriodDtype, cls).is_dtype(dtype) + @classmethod + def construct_array_type(cls): + from pandas.core.arrays import PeriodArray + + return PeriodArray + @register_extension_dtype class IntervalDtype(PandasExtensionDtype, ExtensionDtype): @@ -709,7 +760,7 @@ class IntervalDtype(PandasExtensionDtype, ExtensionDtype): str = '|O08' base = np.dtype('O') num = 103 - _metadata = ['subtype'] + _metadata = ('subtype',) _match = re.compile(r"(I|i)nterval\[(?P.+)\]") _cache = {} @@ -760,7 +811,8 @@ def __new__(cls, subtype=None): @classmethod def construct_array_type(cls): - """Return the array type associated with this dtype + """ + Return the array type associated with this dtype Returns ------- diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index cb54c94d29205..7a3ff5d295421 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -53,12 +53,21 @@ def _check(cls, inst): ('sparse_array', 'sparse_series')) ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")) +ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", + ("datetimearray")) +ABCTimedeltaArray = create_pandas_abc_type("ABCTimedeltaArray", "_typ", + ("timedeltaarray")) +ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", + ("periodarray", )) ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period", )) ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", )) ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ", - ("extension", "categorical",)) + ("extension", + "categorical", + "periodarray", + )) class _ABCGeneric(type): diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 67f391615eedb..241a1b471f677 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -1,12 +1,15 @@ """ basic inference routines """ +from numbers import Number import re + import numpy as np -from numbers import Number -from pandas import compat -from pandas.compat import (PY2, string_types, text_type, - string_and_binary_types, re_type) + from pandas._libs import lib +from pandas.compat import ( + PY2, Set, re_type, string_and_binary_types, string_types, text_type) + +from pandas import compat is_bool = lib.is_bool @@ -41,7 +44,7 @@ def is_number(obj): See Also -------- - pandas.api.types.is_integer: checks a subgroup of numbers + pandas.api.types.is_integer: Checks a subgroup of numbers. Examples -------- @@ -70,7 +73,7 @@ def is_string_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Examples -------- @@ -124,7 +127,7 @@ def is_iterator(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -169,7 +172,7 @@ def is_file_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -200,7 +203,7 @@ def is_re(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -224,7 +227,7 @@ def is_re_compilable(obj): Parameters ---------- - obj : The object to check. 
+ obj : The object to check Returns ------- @@ -247,7 +250,7 @@ def is_re_compilable(obj): return True -def is_list_like(obj): +def is_list_like(obj, allow_sets=True): """ Check if the object is list-like. @@ -258,7 +261,11 @@ def is_list_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check + allow_sets : boolean, default True + If this parameter is False, sets will not be considered list-like + + .. versionadded:: 0.24.0 Returns ------- @@ -283,11 +290,15 @@ def is_list_like(obj): False """ - return (isinstance(obj, compat.Iterable) and + return (isinstance(obj, compat.Iterable) # we do not count strings/unicode/bytes as list-like - not isinstance(obj, string_and_binary_types) and + and not isinstance(obj, string_and_binary_types) + # exclude zero-dimensional numpy arrays, effectively scalars - not (isinstance(obj, np.ndarray) and obj.ndim == 0)) + and not (isinstance(obj, np.ndarray) and obj.ndim == 0) + + # exclude sets if allow_sets is False + and not (allow_sets is False and isinstance(obj, Set))) def is_array_like(obj): @@ -299,7 +310,7 @@ def is_array_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -332,7 +343,7 @@ def is_nested_list_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -373,7 +384,7 @@ def is_dict_like(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -397,7 +408,7 @@ def is_named_tuple(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- @@ -457,7 +468,7 @@ def is_sequence(obj): Parameters ---------- - obj : The object to check. + obj : The object to check Returns ------- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 66998aa6866f6..809dcbd054ea0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -2,27 +2,19 @@ missing types & inference """ import numpy as np + from pandas._libs import lib, missing as libmissing from pandas._libs.tslibs import NaT, iNaT -from .generic import (ABCMultiIndex, ABCSeries, - ABCIndexClass, ABCGeneric, - ABCExtensionArray) -from .common import (is_string_dtype, is_datetimelike, - is_datetimelike_v_numeric, is_float_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, - is_period_dtype, - is_complex_dtype, - is_string_like_dtype, is_bool_dtype, - is_integer_dtype, is_dtype_equal, - is_extension_array_dtype, - needs_i8_conversion, ensure_object, - pandas_dtype, - is_scalar, - is_object_dtype, - is_integer, - _TD_DTYPE, - _NS_DTYPE) + +from .common import ( + _NS_DTYPE, _TD_DTYPE, ensure_object, is_bool_dtype, is_complex_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike, + is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, + is_float_dtype, is_integer, is_integer_dtype, is_object_dtype, + is_period_dtype, is_scalar, is_string_dtype, is_string_like_dtype, + is_timedelta64_dtype, needs_i8_conversion, pandas_dtype) +from .generic import ( + ABCExtensionArray, ABCGeneric, ABCIndexClass, ABCMultiIndex, ABCSeries) from .inference import is_list_like isposinf_scalar = libmissing.isposinf_scalar @@ -51,7 +43,7 @@ def isna(obj): See Also -------- - notna : boolean inverse of pandas.isna. + notna : Boolean inverse of pandas.isna. Series.isna : Detect missing values in a Series. DataFrame.isna : Detect missing values in a DataFrame. 
Index.isna : Detect missing values in an Index. @@ -187,10 +179,18 @@ def _use_inf_as_na(key): def _isna_ndarraylike(obj): - values = getattr(obj, 'values', obj) + is_extension = is_extension_array_dtype(obj) + + if not is_extension: + # Avoid accessing `.values` on things like + # PeriodIndex, which may be expensive. + values = getattr(obj, 'values', obj) + else: + values = obj + dtype = values.dtype - if is_extension_array_dtype(obj): + if is_extension: if isinstance(obj, (ABCIndexClass, ABCSeries)): values = obj._values else: @@ -209,7 +209,7 @@ def _isna_ndarraylike(obj): vec = libmissing.isnaobj(values.ravel()) result[...] = vec.reshape(shape) - elif needs_i8_conversion(obj): + elif needs_i8_conversion(dtype): # this is the NaT pattern result = values.view('i8') == iNaT else: @@ -274,7 +274,7 @@ def notna(obj): See Also -------- - isna : boolean inverse of pandas.notna. + isna : Boolean inverse of pandas.notna. Series.notna : Detect valid values in a Series. DataFrame.notna : Detect valid values in a DataFrame. Index.notna : Detect valid values in an Index. @@ -499,6 +499,19 @@ def na_value_for_dtype(dtype, compat=True): Returns ------- np.dtype or a pandas dtype + + Examples + -------- + >>> na_value_for_dtype(np.dtype('int64')) + 0 + >>> na_value_for_dtype(np.dtype('int64'), compat=False) + nan + >>> na_value_for_dtype(np.dtype('float64')) + nan + >>> na_value_for_dtype(np.dtype('bool')) + False + >>> na_value_for_dtype(np.dtype('datetime64[ns]')) + NaT """ dtype = pandas_dtype(dtype) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 986fe347898f5..2a8d58b8867b7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1,3 +1,5 @@ +# pylint: disable=E1101 +# pylint: disable=W0212,W0703,W0622 """ DataFrame --------- @@ -9,11 +11,9 @@ labeling information """ from __future__ import division -# pylint: disable=E1101,E1103 -# pylint: disable=W0212,W0231,W0703,W0622 -import functools import collections +import functools import itertools import sys import warnings @@ -22,11 +22,23 @@ import numpy as np import numpy.ma as ma -from pandas.core.accessor import CachedAccessor +from pandas._libs import lib, algos as libalgos + +from pandas.util._decorators import (Appender, Substitution, + rewrite_axis_style_signature, + deprecate_kwarg) +from pandas.util._validators import (validate_bool_kwarg, + validate_axis_style_args) + +from pandas import compat +from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u, + OrderedDict, PY36, raise_with_traceback, + string_and_binary_types) +from pandas.compat.numpy import function as nv + from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, - construct_1d_arraylike_from_scalar, infer_dtype_from_scalar, maybe_cast_to_datetime, maybe_infer_to_datetimelike, @@ -37,11 +49,9 @@ maybe_upcast_putmask, find_common_type) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_object_dtype, is_extension_type, is_extension_array_dtype, - is_datetimetz, is_datetime64_any_dtype, is_bool_dtype, is_integer_dtype, @@ -59,49 +69,36 @@ is_iterator, is_sequence, is_named_tuple) -from pandas.core.dtypes.concat import _get_sliced_frame_result_type +from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex from pandas.core.dtypes.missing import isna, notna - +from pandas.core import algorithms +from pandas.core import common as com +from pandas.core import nanops +from pandas.core import ops +from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import Categorical, 
ExtensionArray +from pandas.core.config import get_option from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import (Index, MultiIndex, ensure_index, ensure_index_from_sequences) +from pandas.core.indexes import base as ibase +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) -from pandas.core.internals import (BlockManager, - create_block_manager_from_arrays, - create_block_manager_from_blocks) +from pandas.core.internals import BlockManager +from pandas.core.internals.construction import ( + masked_rec_array_to_mgr, get_names_from_index, to_arrays, + reorder_arrays, init_ndarray, init_dict, + arrays_to_mgr, sanitize_index) from pandas.core.series import Series -from pandas.core.arrays import Categorical, ExtensionArray -import pandas.core.algorithms as algorithms -from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, - OrderedDict, raise_with_traceback, - string_and_binary_types) -from pandas import compat -from pandas.compat import PY36 -from pandas.compat.numpy import function as nv -from pandas.util._decorators import (Appender, Substitution, - rewrite_axis_style_signature, - deprecate_kwarg) -from pandas.util._validators import (validate_bool_kwarg, - validate_axis_style_args) -from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex -import pandas.core.indexes.base as ibase - -import pandas.core.common as com -import pandas.core.nanops as nanops -import pandas.core.ops as ops -import pandas.io.formats.console as console -import pandas.io.formats.format as fmt +from pandas.io.formats import console +from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing -import pandas.plotting._core as gfx - -from pandas._libs import lib, algos as libalgos -from pandas.core.config import get_option +import pandas.plotting._core as gfx # --------------------------------------------------------------------- # Docstring templates @@ -222,9 +219,9 @@ See Also -------- -merge_ordered : merge with optional filling/interpolation. -merge_asof : merge on nearest keys. -DataFrame.join : similar method using indices. +merge_ordered : Merge with optional filling/interpolation. +merge_asof : Merge on nearest keys. +DataFrame.join : Similar method using indices. Examples -------- @@ -286,7 +283,8 @@ class DataFrame(NDFrame): - """ Two-dimensional size-mutable, potentially heterogeneous tabular data + """ + Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. The primary pandas data structure. @@ -347,12 +345,12 @@ class DataFrame(NDFrame): 1 4 5 6 2 7 8 9 - See also + See Also -------- - DataFrame.from_records : constructor from tuples, also record arrays - DataFrame.from_dict : from dicts of Series, arrays, or dicts - DataFrame.from_items : from sequence of (key, value) pairs - pandas.read_csv, pandas.read_table, pandas.read_clipboard + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. 
+ DataFrame.from_items : From sequence of (key, value) pairs + pandas.read_csv, pandas.read_table, pandas.read_clipboard. """ @property @@ -361,7 +359,7 @@ def _constructor(self): _constructor_sliced = Series _deprecations = NDFrame._deprecations | frozenset( - ['sortlevel', 'get_value', 'set_value', 'from_csv', 'from_items']) + ['get_value', 'set_value', 'from_csv', 'from_items']) _accessors = set() @property @@ -369,6 +367,9 @@ def _constructor_expanddim(self): from pandas.core.panel import Panel return Panel + # ---------------------------------------------------------------------- + # Constructors + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): if data is None: @@ -383,13 +384,13 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) elif isinstance(data, dict): - mgr = self._init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords # masked recarray if isinstance(data, mrecords.MaskedRecords): - mgr = _masked_rec_array_to_mgr(data, index, columns, dtype, - copy) + mgr = masked_rec_array_to_mgr(data, index, columns, dtype, + copy) # a masked array else: @@ -399,8 +400,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data[mask] = fill_value else: data = data.copy() - mgr = self._init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, + copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -408,13 +409,13 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns - mgr = self._init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype) elif getattr(data, 'name', None) is not None: - mgr = self._init_dict({data.name: data}, index, columns, - dtype=dtype) + mgr = init_dict({data.name: data}, index, columns, + dtype=dtype) else: - mgr = self._init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, + copy=copy) # For data is list-like, or Iterable (will consume into list) elif (isinstance(data, compat.Iterable) @@ -425,25 +426,25 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields - arrays, columns = _to_arrays(data, columns, dtype=dtype) + arrays, columns = to_arrays(data, columns, dtype=dtype) columns = ensure_index(columns) # set the index if index is None: if isinstance(data[0], Series): - index = _get_names_from_index(data) + index = get_names_from_index(data) elif isinstance(data[0], Categorical): index = ibase.default_index(len(data[0])) else: index = ibase.default_index(len(data)) - mgr = _arrays_to_mgr(arrays, columns, index, columns, - dtype=dtype) + mgr = arrays_to_mgr(arrays, columns, index, columns, + dtype=dtype) else: - mgr = self._init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, + copy=copy) else: - mgr = self._init_dict({}, index, columns, dtype=dtype) + mgr = init_dict({}, index, columns, dtype=dtype) else: try: arr = np.array(data, dtype=dtype, copy=copy) @@ -455,124 +456,14 @@ def 
__init__(self, data=None, index=None, columns=None, dtype=None, if arr.ndim == 0 and index is not None and columns is not None: values = cast_scalar_to_array((len(index), len(columns)), data, dtype=dtype) - mgr = self._init_ndarray(values, index, columns, - dtype=values.dtype, copy=False) + mgr = init_ndarray(values, index, columns, + dtype=values.dtype, copy=False) else: raise ValueError('DataFrame constructor not properly called!') NDFrame.__init__(self, mgr, fastpath=True) - def _init_dict(self, data, index, columns, dtype=None): - """ - Segregate Series based on type and coerce into matrices. - Needs to handle a lot of exceptional cases. - """ - if columns is not None: - arrays = Series(data, index=columns, dtype=object) - data_names = arrays.index - - missing = arrays.isnull() - if index is None: - # GH10856 - # raise ValueError if only scalars in dict - index = extract_index(arrays[~missing]) - else: - index = ensure_index(index) - - # no obvious "empty" int column - if missing.any() and not is_integer_dtype(dtype): - if dtype is None or np.issubdtype(dtype, np.flexible): - # 1783 - nan_dtype = object - else: - nan_dtype = dtype - v = construct_1d_arraylike_from_scalar(np.nan, len(index), - nan_dtype) - arrays.loc[missing] = [v] * missing.sum() - - else: - keys = com.dict_keys_to_ordered_list(data) - columns = data_names = Index(keys) - arrays = [data[k] for k in keys] - - return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) - - def _init_ndarray(self, values, index, columns, dtype=None, copy=False): - # input must be a ndarray, list, Series, index - - if isinstance(values, Series): - if columns is None: - if values.name is not None: - columns = [values.name] - if index is None: - index = values.index - else: - values = values.reindex(index) - - # zero len case (GH #2234) - if not len(values) and columns is not None and len(columns): - values = np.empty((0, 1), dtype=object) - - # helper to create the axes as indexes - def _get_axes(N, K, index=index, columns=columns): - # return axes or defaults - - if index is None: - index = ibase.default_index(N) - else: - index = ensure_index(index) - - if columns is None: - columns = ibase.default_index(K) - else: - columns = ensure_index(columns) - return index, columns - - # we could have a categorical type passed or coerced to 'category' - # recast this to an _arrays_to_mgr - if (is_categorical_dtype(getattr(values, 'dtype', None)) or - is_categorical_dtype(dtype)): - - if not hasattr(values, 'dtype'): - values = _prep_ndarray(values, copy=copy) - values = values.ravel() - elif copy: - values = values.copy() - - index, columns = _get_axes(len(values), 1) - return _arrays_to_mgr([values], columns, index, columns, - dtype=dtype) - elif (is_datetimetz(values) or is_extension_array_dtype(values)): - # GH19157 - if columns is None: - columns = [0] - return _arrays_to_mgr([values], columns, index, columns, - dtype=dtype) - - # by definition an array here - # the dtypes will be coerced to a single dtype - values = _prep_ndarray(values, copy=copy) - - if dtype is not None: - if not is_dtype_equal(values.dtype, dtype): - try: - values = values.astype(dtype) - except Exception as orig: - e = ValueError("failed to cast to '{dtype}' (Exception " - "was: {orig})".format(dtype=dtype, - orig=orig)) - raise_with_traceback(e) - - index, columns = _get_axes(*values.shape) - values = values.T - - # if we don't have a dtype specified, then try to convert objects - # on the entire block; this is to convert if we have datetimelike's - # embedded 
in an object type - if dtype is None and is_object_dtype(values): - values = maybe_infer_to_datetimelike(values) - - return create_block_manager_from_blocks([values], [columns, index]) + # ---------------------------------------------------------------------- @property def axes(self): @@ -642,6 +533,9 @@ def _is_homogeneous_type(self): else: return not self._data.is_mixed_type + # ---------------------------------------------------------------------- + # Rendering Methods + def _repr_fits_vertical_(self): """ Check length against max_rows. @@ -652,10 +546,11 @@ def _repr_fits_vertical_(self): def _repr_fits_horizontal_(self, ignore_width=False): """ Check if full repr fits in horizontal boundaries imposed by the display - options width and max_columns. In case off non-interactive session, no - boundaries apply. + options width and max_columns. + + In case off non-interactive session, no boundaries apply. - ignore_width is here so ipnb+HTML output can behave the way + `ignore_width` is here so ipnb+HTML output can behave the way users expect. display.max_columns remains in effect. GH3541, GH3573 """ @@ -703,14 +598,16 @@ def _repr_fits_horizontal_(self, ignore_width=False): return repr_width < width def _info_repr(self): - """True if the repr should show the info view.""" + """ + True if the repr should show the info view. + """ info_repr_option = (get_option("display.large_repr") == "info") return info_repr_option and not (self._repr_fits_horizontal_() and self._repr_fits_vertical_()) def __unicode__(self): """ - Return a string representation for a particular DataFrame + Return a string representation for a particular DataFrame. Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. @@ -735,6 +632,7 @@ def __unicode__(self): def _repr_html_(self): """ Return a html representation for a particular DataFrame. + Mainly for IPython notebook. """ # qtconsole doesn't report its line width, and also @@ -765,6 +663,57 @@ def _repr_html_(self): else: return None + @Substitution(header='Write out the column names. If a list of strings ' + 'is given, it is assumed to be aliases for the ' + 'column names') + @Substitution(shared_params=fmt.common_docstring, + returns=fmt.return_docstring) + def to_string(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, justify=None, + max_rows=None, max_cols=None, show_dimensions=False, + decimal='.', line_width=None): + """ + Render a DataFrame to a console-friendly tabular output. + %(shared_params)s + line_width : int, optional + Width to wrap a line in characters. + %(returns)s + See Also + -------- + to_html : Convert DataFrame to HTML. 
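A quick sketch (not part of the patch) of the new allow_sets flag that is_list_like gains earlier in this change set:

    from pandas.api.types import is_list_like

    print(is_list_like([1, 2, 3]))                    # True
    print(is_list_like({1, 2, 3}))                    # True  (default keeps sets list-like)
    print(is_list_like({1, 2, 3}, allow_sets=False))  # False (new in 0.24.0)
    print(is_list_like("abc"))                        # False (strings stay excluded)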
+ + Examples + -------- + >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} + >>> df = pd.DataFrame(d) + >>> print(df.to_string()) + col1 col2 + 0 1 4 + 1 2 5 + 2 3 6 + """ + + formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, justify=justify, + index_names=index_names, + header=header, index=index, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width) + formatter.to_string() + + if buf is None: + result = formatter.buf.getvalue() + return result + + # ---------------------------------------------------------------------- + @property def style(self): """ @@ -864,16 +813,20 @@ def iterrows(self): data types, the iterator returns a copy and not a view, and writing to it will have no effect. - Returns - ------- + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + it : generator A generator that iterates over the rows of the frame. - See also + See Also -------- itertuples : Iterate over DataFrame rows as namedtuples of the values. iteritems : Iterate over (column name, Series) pairs. - """ columns = self.columns klass = self._constructor_sliced @@ -971,12 +924,14 @@ def itertuples(self, index=True, name="Pandas"): items = iteritems def __len__(self): - """Returns length of info axis, but here we use the index """ + """ + Returns length of info axis, but here we use the index. + """ return len(self.index) def dot(self, other): """ - Matrix multiplication with DataFrame or Series objects. Can also be + Matrix multiplication with DataFrame or Series objects. Can also be called using `self @ other` in Python >= 3.5. Parameters @@ -1003,7 +958,7 @@ def dot(self, other): rvals = np.asarray(other) if lvals.shape[1] != rvals.shape[0]: raise ValueError('Dot product shape mismatch, ' - '{l} vs {r}'.format(l=lvals.shape, + '{s} vs {r}'.format(s=lvals.shape, r=rvals.shape)) if isinstance(other, DataFrame): @@ -1021,11 +976,15 @@ def dot(self, other): raise TypeError('unsupported type: {oth}'.format(oth=type(other))) def __matmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ return self.dot(other) def __rmatmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ return self.T.dot(np.transpose(other)).T # ---------------------------------------------------------------------- @@ -1062,8 +1021,8 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): See Also -------- DataFrame.from_records : DataFrame from ndarray (structured - dtype), list of tuples, dict, or DataFrame - DataFrame : DataFrame object creation using constructor + dtype), list of tuples, dict, or DataFrame. + DataFrame : DataFrame object creation using constructor. Examples -------- @@ -1113,6 +1072,50 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) + def to_numpy(self): + """ + Convert the DataFrame to a NumPy array. + + .. versionadded:: 0.24.0 + + The dtype of the returned array will be the common NumPy + dtype of all types in the DataFrame. 
For example, + if the dtypes are ``float16`` and ``float32``, the results + dtype will be ``float32``. This may require copying data and + coercing values, which may be expensive. + + Returns + ------- + array : numpy.ndarray + + See Also + -------- + Series.to_numpy : Similar method for Series. + + Examples + -------- + >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + array([[1, 3], + [2, 4]]) + + With heterogenous data, the lowest common type will have to + be used. + + >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) + >>> df.to_numpy() + array([[1. , 3. ], + [2. , 4.5]]) + + For a mix of numeric and non-numeric types, the output array will + have object dtype. + + >>> df['C'] = pd.date_range('2000', periods=2) + >>> df.to_numpy() + array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], + [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) + """ + return self.values + def to_dict(self, orient='dict', into=dict): """ Convert the DataFrame to a dictionary. @@ -1147,58 +1150,60 @@ def to_dict(self, orient='dict', into=dict): Returns ------- - result : collections.Mapping like {column -> {index -> value}} + dict, list or collections.Mapping + Return a collections.Mapping object representing the DataFrame. + The resulting transformation depends on the `orient` parameter. See Also -------- - DataFrame.from_dict: create a DataFrame from a dictionary - DataFrame.to_json: convert a DataFrame to JSON format + DataFrame.from_dict: Create a DataFrame from a dictionary. + DataFrame.to_json: Convert a DataFrame to JSON format. Examples -------- >>> df = pd.DataFrame({'col1': [1, 2], ... 'col2': [0.5, 0.75]}, - ... index=['a', 'b']) + ... index=['row1', 'row2']) >>> df - col1 col2 - a 1 0.50 - b 2 0.75 + col1 col2 + row1 1 0.50 + row2 2 0.75 >>> df.to_dict() - {'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}} + {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} You can specify the return orientation. >>> df.to_dict('series') - {'col1': a 1 - b 2 - Name: col1, dtype: int64, - 'col2': a 0.50 - b 0.75 - Name: col2, dtype: float64} + {'col1': row1 1 + row2 2 + Name: col1, dtype: int64, + 'col2': row1 0.50 + row2 0.75 + Name: col2, dtype: float64} >>> df.to_dict('split') - {'index': ['a', 'b'], 'columns': ['col1', 'col2'], - 'data': [[1.0, 0.5], [2.0, 0.75]]} + {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], + 'data': [[1, 0.5], [2, 0.75]]} >>> df.to_dict('records') - [{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}] + [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] >>> df.to_dict('index') - {'a': {'col1': 1.0, 'col2': 0.5}, 'b': {'col1': 2.0, 'col2': 0.75}} + {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} You can also specify the mapping type. 
>>> from collections import OrderedDict, defaultdict >>> df.to_dict(into=OrderedDict) - OrderedDict([('col1', OrderedDict([('a', 1), ('b', 2)])), - ('col2', OrderedDict([('a', 0.5), ('b', 0.75)]))]) + OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), + ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) If you want a `defaultdict`, you need to initialize it: >>> dd = defaultdict(list) >>> df.to_dict('records', into=dd) - [defaultdict(, {'col1': 1.0, 'col2': 0.5}), - defaultdict(, {'col1': 2.0, 'col2': 0.75})] + [defaultdict(, {'col1': 1, 'col2': 0.5}), + defaultdict(, {'col1': 2, 'col2': 0.75})] """ if not self.columns.is_unique: warnings.warn("DataFrame columns are not unique, some " @@ -1214,26 +1219,32 @@ def to_dict(self, orient='dict', into=dict): elif orient.lower().startswith('sp'): return into_c((('index', self.index.tolist()), ('columns', self.columns.tolist()), - ('data', lib.map_infer(self.values.ravel(), - com.maybe_box_datetimelike) - .reshape(self.values.shape).tolist()))) + ('data', [ + list(map(com.maybe_box_datetimelike, t)) + for t in self.itertuples(index=False)] + ))) elif orient.lower().startswith('s'): return into_c((k, com.maybe_box_datetimelike(v)) for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): - return [into_c((k, com.maybe_box_datetimelike(v)) - for k, v in zip(self.columns, np.atleast_1d(row))) - for row in self.values] + return [ + into_c((k, com.maybe_box_datetimelike(v)) + for k, v in compat.iteritems(row._asdict())) + for row in self.itertuples(index=False)] elif orient.lower().startswith('i'): + if not self.index.is_unique: + raise ValueError( + "DataFrame index must be unique for orient='index'." + ) return into_c((t[0], dict(zip(self.columns, t[1:]))) for t in self.itertuples()) else: raise ValueError("orient '{o}' not understood".format(o=orient)) def to_gbq(self, destination_table, project_id=None, chunksize=None, - reauth=False, if_exists='fail', private_key=None, - auth_local_webserver=False, table_schema=None, location=None, - progress_bar=True, verbose=None): + reauth=False, if_exists='fail', auth_local_webserver=False, + table_schema=None, location=None, progress_bar=True, + credentials=None, verbose=None, private_key=None): """ Write a DataFrame to a Google BigQuery table. @@ -1266,10 +1277,6 @@ def to_gbq(self, destination_table, project_id=None, chunksize=None, If table exists, drop it, recreate it, and insert data. ``'append'`` If table exists, insert data. Create if does not exist. - private_key : str, optional - Service account private key in JSON format. Can be file path - or string contents. This is useful for remote server - authentication (eg. Jupyter/IPython notebook on remote host). auth_local_webserver : bool, default False Use the `local webserver flow`_ instead of the `console flow`_ when getting user credentials. @@ -1301,10 +1308,31 @@ def to_gbq(self, destination_table, project_id=None, chunksize=None, chunk by chunk. *New in version 0.5.0 of pandas-gbq*. + credentials : google.auth.credentials.Credentials, optional + Credentials for accessing Google APIs. Use this parameter to + override default credentials, such as to use Compute Engine + :class:`google.auth.compute_engine.Credentials` or Service + Account :class:`google.oauth2.service_account.Credentials` + directly. + + *New in version 0.8.0 of pandas-gbq*. + + .. versionadded:: 0.24.0 verbose : bool, deprecated - Deprecated in Pandas-GBQ 0.4.0. Use the `logging module + Deprecated in pandas-gbq version 0.4.0. 
Use the `logging module to adjust verbosity instead `__. + private_key : str, deprecated + Deprecated in pandas-gbq version 0.8.0. Use the ``credentials`` + parameter and + :func:`google.oauth2.service_account.Credentials.from_service_account_info` + or + :func:`google.oauth2.service_account.Credentials.from_service_account_file` + instead. + + Service account private key in JSON format. Can be file path + or string contents. This is useful for remote server + authentication (eg. Jupyter/IPython notebook on remote host). See Also -------- @@ -1314,17 +1342,17 @@ def to_gbq(self, destination_table, project_id=None, chunksize=None, from pandas.io import gbq return gbq.to_gbq( self, destination_table, project_id=project_id, - chunksize=chunksize, reauth=reauth, - if_exists=if_exists, private_key=private_key, + chunksize=chunksize, reauth=reauth, if_exists=if_exists, auth_local_webserver=auth_local_webserver, table_schema=table_schema, location=location, - progress_bar=progress_bar, verbose=verbose) + progress_bar=progress_bar, credentials=credentials, + verbose=verbose, private_key=private_key) @classmethod def from_records(cls, data, index=None, exclude=None, columns=None, coerce_float=False, nrows=None): """ - Convert structured or record ndarray to DataFrame + Convert structured or record ndarray to DataFrame. Parameters ---------- @@ -1392,17 +1420,17 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns.append(k) arrays.append(v) - arrays, arr_columns = _reorder_arrays(arrays, arr_columns, - columns) + arrays, arr_columns = reorder_arrays(arrays, arr_columns, + columns) elif isinstance(data, (np.ndarray, DataFrame)): - arrays, columns = _to_arrays(data, columns) + arrays, columns = to_arrays(data, columns) if columns is not None: columns = ensure_index(columns) arr_columns = columns else: - arrays, arr_columns = _to_arrays(data, columns, - coerce_float=coerce_float) + arrays, arr_columns = to_arrays(data, columns, + coerce_float=coerce_float) arr_columns = ensure_index(arr_columns) if columns is not None: @@ -1444,7 +1472,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = _arrays_to_mgr(arrays, arr_columns, result_index, columns) + mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) return cls(mgr) @@ -1474,9 +1502,9 @@ def to_records(self, index=True, convert_datetime64=None): See Also -------- - DataFrame.from_records: convert structured or record ndarray + DataFrame.from_records: Convert structured or record ndarray to DataFrame. - numpy.recarray: ndarray that allows field access using + numpy.recarray: An ndarray that allows field access using attributes, analogous to typed columns in a spreadsheet. @@ -1549,7 +1577,8 @@ def to_records(self, index=True, convert_datetime64=None): @classmethod def from_items(cls, items, columns=None, orient='columns'): - """Construct a dataframe from a list of tuples + """ + Construct a DataFrame from a list of tuples. .. deprecated:: 0.23.0 `from_items` is deprecated and will be removed in a future version. 
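The `private_key` -> `credentials` hand-off for ``to_gbq`` shown above can be sketched as follows; this is a minimal, untested sketch that assumes pandas-gbq >= 0.8.0 with google-auth installed, and the key-file path, project id and table name ('key.json', 'my-project', 'dataset.table') are purely illustrative placeholders.

# Sketch only: pass explicit service-account credentials instead of the
# deprecated ``private_key`` argument. All names below are placeholders.
import pandas as pd
from google.oauth2 import service_account

creds = service_account.Credentials.from_service_account_file('key.json')
df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
df.to_gbq('dataset.table', project_id='my-project',
          credentials=creds, if_exists='append')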
@@ -1636,14 +1665,15 @@ def from_items(cls, items, columns=None, orient='columns'): @classmethod def _from_arrays(cls, arrays, columns, index, dtype=None): - mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) @classmethod def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True, encoding=None, tupleize_cols=None, infer_datetime_format=False): - """Read CSV file. + """ + Read CSV file. .. deprecated:: 0.21.0 Use :func:`pandas.read_csv` instead. @@ -1679,19 +1709,18 @@ def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True, tupleize_cols : boolean, default False write multi_index columns as a list of tuples (if True) or new (expanded format) if False) - infer_datetime_format: boolean, default False + infer_datetime_format : boolean, default False If True and `parse_dates` is True for a column, try to infer the datetime format based on the first datetime string. If the format can be inferred, there often will be a large parsing speed-up. - See also + See Also -------- pandas.read_csv Returns ------- y : DataFrame - """ warnings.warn("from_csv is deprecated. Please use read_csv(...) " @@ -1760,7 +1789,7 @@ def to_sparse(self, fill_value=None, kind='block'): >>> type(sdf) """ - from pandas.core.sparse.frame import SparseDataFrame + from pandas.core.sparse.api import SparseDataFrame return SparseDataFrame(self._series, index=self.index, columns=self.columns, default_kind=kind, default_fill_value=fill_value) @@ -1797,7 +1826,7 @@ def to_panel(self): selfsorted = self major_axis, minor_axis = selfsorted.index.levels - major_labels, minor_labels = selfsorted.index.labels + major_codes, minor_codes = selfsorted.index.codes shape = len(major_axis), len(minor_axis) # preserve names, if any @@ -1812,44 +1841,29 @@ def to_panel(self): # create new manager new_mgr = selfsorted._data.reshape_nd(axes=new_axes, - labels=[major_labels, - minor_labels], + labels=[major_codes, + minor_codes], shape=shape, ref_items=selfsorted.columns) return self._constructor_expanddim(new_mgr) - @Appender(_shared_docs['to_excel'] % _shared_doc_kwargs) - def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', verbose=True, - freeze_panes=None): - - from pandas.io.formats.excel import ExcelFormatter - formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns, - header=header, - float_format=float_format, index=index, - index_label=index_label, - merge_cells=merge_cells, - inf_rep=inf_rep) - formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow, - startcol=startcol, freeze_panes=freeze_panes, - engine=engine) - @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None, version=114, convert_strl=None): """ - Export Stata binary dta files. + Export DataFrame object to Stata dta format. + + Writes the DataFrame to a Stata dataset file. + "dta" files contain a Stata dataset. Parameters ---------- - fname : path (string), buffer or path object - string, path object (pathlib.Path or py._path.local.LocalPath) or - object implementing a binary write() functions. 
If using a buffer + fname : str, buffer or path object + String, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() function. If using a buffer then the buffer will not be automatically closed after the file data has been written. convert_dates : dict @@ -1868,7 +1882,7 @@ def to_stata(self, fname, convert_dates=None, write_index=True, time_stamp : datetime A datetime to use as file creation date. Default is the current time. - data_label : str + data_label : str, optional A label for the data set. Must be 80 characters or smaller. variable_labels : dict Dictionary containing columns as keys and variable labels as @@ -1876,7 +1890,7 @@ def to_stata(self, fname, convert_dates=None, write_index=True, .. versionadded:: 0.19.0 - version : {114, 117} + version : {114, 117}, default 114 Version to use in the output dta file. Version 114 can be used read by Stata 10 and later. Version 117 can be read by Stata 13 or later. Version 114 limits string variables to 244 characters or @@ -1908,27 +1922,16 @@ def to_stata(self, fname, convert_dates=None, write_index=True, See Also -------- - pandas.read_stata : Import Stata data files - pandas.io.stata.StataWriter : low-level writer for Stata data files - pandas.io.stata.StataWriter117 : low-level writer for version 117 files + read_stata : Import Stata data files. + io.stata.StataWriter : Low-level writer for Stata data files. + io.stata.StataWriter117 : Low-level writer for version 117 files. Examples -------- - >>> data.to_stata('./data_file.dta') - - Or with dates - - >>> data.to_stata('./date_data_file.dta', {2 : 'tw'}) - - Alternatively you can create an instance of the StataWriter class - - >>> writer = StataWriter('./data_file.dta', data) - >>> writer.write_file() - - With dates: - - >>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'}) - >>> writer.write_file() + >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', + ... 'parrot'], + ... 'speed': [350, 18, 361, 15]}) + >>> df.to_stata('animals.dta') # doctest: +SKIP """ kwargs = {} if version not in (114, 117): @@ -1950,7 +1953,7 @@ def to_stata(self, fname, convert_dates=None, write_index=True, def to_feather(self, fname): """ - write out the binary feather-format for DataFrames + Write out the binary feather-format for DataFrames. .. versionadded:: 0.20.0 @@ -1958,13 +1961,12 @@ def to_feather(self, fname): ---------- fname : str string file path - """ from pandas.io.feather_format import to_feather to_feather(self, fname) def to_parquet(self, fname, engine='auto', compression='snappy', - index=None, **kwargs): + index=None, partition_cols=None, **kwargs): """ Write a DataFrame to the binary parquet format. @@ -1978,7 +1980,11 @@ def to_parquet(self, fname, engine='auto', compression='snappy', Parameters ---------- fname : str - String file path. + File path or Root Directory path. Will be used as Root Directory + path while writing a partitioned dataset. + + .. versionchanged:: 0.24.0 + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` @@ -1993,6 +1999,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy', .. versionadded:: 0.24.0 + partition_cols : list, optional, default None + Column names by which to partition the dataset + Columns are partitioned in the order they are given + + .. versionadded:: 0.24.0 + **kwargs Additional arguments passed to the parquet library. 
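The new ``partition_cols`` argument described above can be exercised with a short sketch; 'out_dir' is an illustrative output directory and a working pyarrow or fastparquet installation is assumed.

# Sketch only: write a dataset partitioned by the 'year' column.
# 'out_dir' is a placeholder; one sub-directory per distinct 'year'
# value is created beneath it.
import pandas as pd

df = pd.DataFrame({'year': [2017, 2017, 2018],
                   'value': [1.0, 2.0, 3.0]})
df.to_parquet('out_dir', partition_cols=['year'])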
See :ref:`pandas io ` for more details. @@ -2013,93 +2025,38 @@ def to_parquet(self, fname, engine='auto', compression='snappy', Examples -------- >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) - >>> df.to_parquet('df.parquet.gzip', compression='gzip') - >>> pd.read_parquet('df.parquet.gzip') + >>> df.to_parquet('df.parquet.gzip', + ... compression='gzip') # doctest: +SKIP + >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP col1 col2 0 1 3 1 2 4 """ from pandas.io.parquet import to_parquet to_parquet(self, fname, engine, - compression=compression, index=index, **kwargs) - - @Substitution(header='Write out the column names. If a list of strings ' - 'is given, it is assumed to be aliases for the ' - 'column names') - @Substitution(shared_params=fmt.common_docstring, - returns=fmt.return_docstring) - def to_string(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, - line_width=None, max_rows=None, max_cols=None, - show_dimensions=False): - """ - Render a DataFrame to a console-friendly tabular output. - - %(shared_params)s - line_width : int, optional - Width to wrap a line in characters. - - %(returns)s - - See Also - -------- - to_html : Convert DataFrame to HTML. + compression=compression, index=index, + partition_cols=partition_cols, **kwargs) - Examples - -------- - >>> d = {'col1' : [1, 2, 3], 'col2' : [4, 5, 6]} - >>> df = pd.DataFrame(d) - >>> print(df.to_string()) - col1 col2 - 0 1 4 - 1 2 5 - 2 3 6 - """ - - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, justify=justify, - index_names=index_names, - header=header, index=index, - line_width=line_width, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions) - formatter.to_string() - - if buf is None: - result = formatter.buf.getvalue() - return result - - @Substitution(header='whether to print column labels, default True') + @Substitution(header='Whether to print column labels, default True') @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, bold_rows=True, - classes=None, escape=True, max_rows=None, max_cols=None, - show_dimensions=False, notebook=False, decimal='.', - border=None, table_id=None): + sparsify=None, index_names=True, justify=None, max_rows=None, + max_cols=None, show_dimensions=False, decimal='.', + bold_rows=True, classes=None, escape=True, + notebook=False, border=None, table_id=None): """ Render a DataFrame as an HTML table. - %(shared_params)s - bold_rows : boolean, default True - Make the row labels bold in the output + bold_rows : bool, default True + Make the row labels bold in the output. classes : str or list or tuple, default None - CSS class(es) to apply to the resulting html table - escape : boolean, default True + CSS class(es) to apply to the resulting html table. + escape : bool, default True Convert the characters <, >, and & to HTML-safe sequences. notebook : {True, False}, default False Whether the generated HTML is for IPython Notebook. - decimal : string, default '.' - Character recognized as decimal separator, e.g. ',' in Europe - - .. 
versionadded:: 0.18.0 - border : int A ``border=border`` attribute is included in the opening `<table>` tag. Default ``pd.options.html.border``. @@ -2110,9 +2067,7 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, A css id is included in the opening `<table>
` tag if specified. .. versionadded:: 0.23.0 - %(returns)s - See Also -------- to_string : Convert DataFrame to a string. @@ -2140,6 +2095,8 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, if buf is None: return formatter.buf.getvalue() + # ---------------------------------------------------------------------- + def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): """ @@ -2237,7 +2194,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, >>> buffer = io.StringIO() >>> df.info(buf=buffer) >>> s = buffer.getvalue() - >>> with open("df_info.txt", "w", encoding="utf-8") as f: + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP ... f.write(s) 260 @@ -2605,7 +2563,8 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover # Getting and setting elements def get_value(self, index, col, takeable=False): - """Quickly retrieve single value at passed column and index + """ + Quickly retrieve single value at passed column and index. .. deprecated:: 0.21.0 Use .at[] or .iat[] accessors instead. @@ -2648,7 +2607,8 @@ def _get_value(self, index, col, takeable=False): _get_value.__doc__ = get_value.__doc__ def set_value(self, index, col, value, takeable=False): - """Put single value at passed column and index + """ + Put single value at passed column and index. .. deprecated:: 0.21.0 Use .at[] or .iat[] accessors instead. @@ -2693,18 +2653,17 @@ def _set_value(self, index, col, value, takeable=False): def _ixs(self, i, axis=0): """ + Parameters + ---------- i : int, slice, or sequence of integers axis : int - """ + Notes + ----- + If slice passed, the resulting data will be a view. + """ # irow if axis == 0: - """ - Notes - ----- - If slice passed, the resulting data will be a view - """ - if isinstance(i, slice): return self[i] else: @@ -2730,12 +2689,6 @@ def _ixs(self, i, axis=0): # icol else: - """ - Notes - ----- - If slice passed, the resulting data will be a view - """ - label = self.columns[i] if isinstance(i, slice): # need to return view @@ -2882,7 +2835,8 @@ def _getitem_frame(self, key): return self.where(key) def query(self, expr, inplace=False, **kwargs): - """Query the columns of a frame with a boolean expression. + """ + Query the columns of a DataFrame with a boolean expression. Parameters ---------- @@ -3152,6 +3106,14 @@ def select_dtypes(self, include=None, exclude=None): 4 True 1.0 5 False 2.0 """ + def _get_info_slice(obj, indexer): + """Slice the info axis of `obj` with `indexer`.""" + if not hasattr(obj, '_info_axis_number'): + msg = 'object of type {typ!r} has no info axis' + raise TypeError(msg.format(typ=type(obj).__name__)) + slices = [slice(None)] * obj.ndim + slices[obj._info_axis_number] = indexer + return tuple(slices) if not is_list_like(include): include = (include,) if include is not None else () @@ -3200,7 +3162,7 @@ def is_dtype_instance_mapper(idx, dtype): exclude_these.iloc[idx] = not any(map(f, exclude)) dtype_indexer = include_these & exclude_these - return self.loc[com.get_info_slice(self, dtype_indexer)] + return self.loc[_get_info_slice(self, dtype_indexer)] def _box_item_values(self, key, values): items = self.columns[self.columns.get_loc(key)] @@ -3210,8 +3172,10 @@ def _box_item_values(self, key, values): return self._box_col_values(values, items) def _box_col_values(self, values, items): - """ provide boxed values for a column """ - klass = _get_sliced_frame_result_type(values, self) + """ + Provide boxed values for a column. 
+ """ + klass = self._constructor_sliced return klass(values, index=self.index, name=items, fastpath=True) def __setitem__(self, key, value): @@ -3276,8 +3240,8 @@ def _setitem_frame(self, key, value): def _ensure_valid_index(self, value): """ - ensure that if we don't have an index, that we can create one from the - passed value + Ensure that if we don't have an index, that we can create one from the + passed value. """ # GH5632, make sure that we are a Series convertible if not len(self.index) and is_list_like(value): @@ -3380,6 +3344,7 @@ def assign(self, **kwargs): Berkeley 25.0 Where the value is a callable, evaluated on `df`: + >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) temp_c temp_f Portland 17.0 62.6 @@ -3387,6 +3352,7 @@ def assign(self, **kwargs): Alternatively, the same behavior can be achieved by directly referencing an existing Series or sequence: + >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) temp_c temp_f Portland 17.0 62.6 @@ -3395,6 +3361,7 @@ def assign(self, **kwargs): In Python 3.6+, you can create multiple columns within the same assign where one of the columns depends on another one defined within the same assign: + >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) temp_c temp_f temp_k @@ -3477,17 +3444,15 @@ def reindexer(value): value = reindexer(value).T elif isinstance(value, ExtensionArray): - from pandas.core.series import _sanitize_index - # Explicitly copy here, instead of in _sanitize_index, + # Explicitly copy here, instead of in sanitize_index, # as sanitize_index won't copy an EA, even with copy=True value = value.copy() - value = _sanitize_index(value, self.index, copy=False) + value = sanitize_index(value, self.index, copy=False) elif isinstance(value, Index) or is_sequence(value): - from pandas.core.series import _sanitize_index # turn me into an ndarray - value = _sanitize_index(value, self.index, copy=False) + value = sanitize_index(value, self.index, copy=False) if not isinstance(value, (np.ndarray, Index)): if isinstance(value, list) and len(value) > 0: value = maybe_convert_platform(value) @@ -3529,14 +3494,13 @@ def reindexer(value): @property def _series(self): - result = {} - for idx, item in enumerate(self.columns): - result[item] = Series(self._data.iget(idx), index=self.index, - name=item) - return result + return {item: Series(self._data.iget(idx), index=self.index, name=item) + for idx, item in enumerate(self.columns)} def lookup(self, row_labels, col_labels): - """Label-based "fancy indexing" function for DataFrame. + """ + Label-based "fancy indexing" function for DataFrame. + Given equal-length arrays of row and column labels, return an array of the values corresponding to each (row, col) pair. @@ -3551,15 +3515,13 @@ def lookup(self, row_labels, col_labels): ----- Akin to:: - result = [] - for row, col in zip(row_labels, col_labels): - result.append(df.get_value(row, col)) + result = [df.get_value(row, col) + for row, col in zip(row_labels, col_labels)] Examples -------- values : ndarray The found values - """ n = len(row_labels) if n != len(col_labels): @@ -3624,7 +3586,9 @@ def _reindex_columns(self, new_columns, method, copy, level, allow_dups=False) def _reindex_multi(self, axes, copy, fill_value): - """ we are guaranteed non-Nones in the axes! """ + """ + We are guaranteed non-Nones in the axes. 
+ """ new_index, row_indexer = self.index.reindex(axes['index']) new_columns, col_indexer = self.columns.reindex(axes['columns']) @@ -3715,9 +3679,9 @@ def drop(self, labels=None, axis=0, index=None, columns=None, -------- DataFrame.loc : Label-location based indexer for selection by label. DataFrame.dropna : Return DataFrame with labels on given axis omitted - where (all or any) data are missing + where (all or any) data are missing. DataFrame.drop_duplicates : Return DataFrame with duplicate rows - removed, optionally only considering certain columns + removed, optionally only considering certain columns. Series.drop : Return Series with specified index labels removed. Raises @@ -3759,8 +3723,8 @@ def drop(self, labels=None, axis=0, index=None, columns=None, >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], - ... labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], ... [250, 150], [1.5, 0.8], [320, 250], @@ -3804,7 +3768,8 @@ def drop(self, labels=None, axis=0, index=None, columns=None, ('inplace', False), ('level', None)]) def rename(self, *args, **kwargs): - """Alter axes labels. + """ + Alter axes labels. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left as-is. Extra labels listed don't throw an @@ -3909,43 +3874,58 @@ def shift(self, periods=1, freq=None, axis=0): def set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False): """ + Set the DataFrame index using existing columns. + Set the DataFrame index (row labels) using one or more existing - columns. By default yields a new object. + columns. The index can replace the existing index or expand on it. Parameters ---------- - keys : column label or list of column labels / arrays - drop : boolean, default True - Delete columns to be used as the new index - append : boolean, default False - Whether to append columns to existing index - inplace : boolean, default False - Modify the DataFrame in place (do not create a new object) - verify_integrity : boolean, default False + keys : label or list of label + Name or names of the columns that will be used as the index. + drop : bool, default True + Delete columns to be used as the new index. + append : bool, default False + Whether to append columns to existing index. + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + verify_integrity : bool, default False Check the new index for duplicates. Otherwise defer the check until necessary. Setting to False will improve the performance of this - method + method. + + Returns + ------- + DataFrame + Changed row labels. + + See Also + -------- + DataFrame.reset_index : Opposite of set_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. Examples -------- >>> df = pd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], - ... 'sale':[55, 40, 84, 31]}) - month sale year - 0 1 55 2012 - 1 4 40 2014 - 2 7 84 2013 - 3 10 31 2014 + ... 
'sale': [55, 40, 84, 31]}) + >>> df + month year sale + 0 1 2012 55 + 1 4 2014 40 + 2 7 2013 84 + 3 10 2014 31 Set the index to become the 'month' column: >>> df.set_index('month') - sale year + year sale month - 1 55 2012 - 4 40 2014 - 7 84 2013 - 10 31 2014 + 1 2012 55 + 4 2014 40 + 7 2013 84 + 10 2014 31 Create a multi-index using columns 'year' and 'month': @@ -3966,15 +3946,30 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 2 2014 4 40 3 2013 7 84 4 2014 10 31 - - Returns - ------- - dataframe : DataFrame """ inplace = validate_bool_kwarg(inplace, 'inplace') if not isinstance(keys, list): keys = [keys] + missing = [] + for col in keys: + if (is_scalar(col) or isinstance(col, tuple)) and col in self: + # tuples can be both column keys or list-likes + # if they are valid column keys, everything is fine + continue + elif is_scalar(col) and col not in self: + # tuples that are not column keys are considered list-like, + # not considered missing + missing.append(col) + elif (not is_list_like(col, allow_sets=False) + or getattr(col, 'ndim', 1) > 1): + raise TypeError('The parameter "keys" may only contain a ' + 'combination of valid column keys and ' + 'one-dimensional list-likes') + + if missing: + raise KeyError('{}'.format(missing)) + if inplace: frame = self else: @@ -3984,7 +3979,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, names = [] if append: names = [x for x in self.index.names] - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) else: @@ -3992,29 +3987,29 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove = [] for col in keys: - if isinstance(col, MultiIndex): - # append all but the last column so we don't have to modify - # the end of this loop - for n in range(col.nlevels - 1): + if isinstance(col, ABCMultiIndex): + for n in range(col.nlevels): arrays.append(col._get_level_values(n)) - - level = col._get_level_values(col.nlevels - 1) names.extend(col.names) - elif isinstance(col, Series): - level = col._values - names.append(col.name) - elif isinstance(col, Index): - level = col + elif isinstance(col, (ABCIndexClass, ABCSeries)): + # if Index then not MultiIndex (treated above) + arrays.append(col) names.append(col.name) - elif isinstance(col, (list, np.ndarray, Index)): - level = col + elif isinstance(col, (list, np.ndarray)): + arrays.append(col) + names.append(None) + elif (is_list_like(col) + and not (isinstance(col, tuple) and col in self)): + # all other list-likes (but avoid valid column keys) + col = list(col) # ensure iterator do not get read twice etc. 
+ arrays.append(col) names.append(None) + # from here, col can only be a column label else: - level = frame[col]._values + arrays.append(frame[col]._values) names.append(col) if drop: to_remove.append(col) - arrays.append(level) index = ensure_index_from_sequences(arrays, names) @@ -4023,7 +4018,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, raise ValueError('Index has duplicate keys: {dup}'.format( dup=duplicates)) - for c in to_remove: + # use set to handle duplicate column names gracefully in case of drop + for c in set(to_remove): del frame[c] # clear up memory usage @@ -4037,22 +4033,22 @@ def set_index(self, keys, drop=True, append=False, inplace=False, def reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=''): """ - For DataFrame with multi-level index, return new DataFrame with - labeling information in the columns under the index names, defaulting - to 'level_0', 'level_1', etc. if any are None. For a standard index, - the index name will be used (if set), otherwise a default 'index' or - 'level_0' (if 'index' is already taken) will be used. + Reset the index, or a level of it. + + Reset the index of the DataFrame, and use the default one instead. + If the DataFrame has a MultiIndex, this method can remove one or more + levels. Parameters ---------- level : int, str, tuple, or list, default None Only remove the given levels from the index. Removes all levels by - default - drop : boolean, default False + default. + drop : bool, default False Do not try to insert index into dataframe columns. This resets the index to the default integer index. - inplace : boolean, default False - Modify the DataFrame in place (do not create a new object) + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). col_level : int or str, default 0 If the columns have multiple levels, determines which level the labels are inserted into. By default it is inserted into the first @@ -4063,13 +4059,20 @@ def reset_index(self, level=None, drop=False, inplace=False, col_level=0, Returns ------- - resetted : DataFrame + DataFrame + DataFrame with the new index. + + See Also + -------- + DataFrame.set_index : Opposite of reset_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. Examples -------- - >>> df = pd.DataFrame([('bird', 389.0), - ... ('bird', 24.0), - ... ('mammal', 80.5), + >>> df = pd.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), ... ('mammal', np.nan)], ... index=['falcon', 'parrot', 'lion', 'monkey'], ... columns=('class', 'max_speed')) @@ -4210,7 +4213,7 @@ def _maybe_casted_values(index, labels=None): if isinstance(self.index, MultiIndex): names = [n if n is not None else ('level_%d' % i) for (i, n) in enumerate(self.index.names)] - to_insert = lzip(self.index.levels, self.index.labels) + to_insert = lzip(self.index.levels, self.index.codes) else: default = 'index' if 'index' not in self else 'level_0' names = ([default] if self.index.name is None @@ -4416,7 +4419,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, def drop_duplicates(self, subset=None, keep='first', inplace=False): """ Return DataFrame with duplicate rows removed, optionally only - considering certain columns + considering certain columns. 
Parameters ---------- @@ -4450,7 +4453,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): def duplicated(self, subset=None, keep='first'): """ Return boolean Series denoting duplicate rows, optionally only - considering certain columns + considering certain columns. Parameters ---------- @@ -4518,10 +4521,8 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, if len(by) > 1: from pandas.core.sorting import lexsort_indexer - keys = [] - for x in by: - k = self._get_label_or_level_values(x, axis=axis) - keys.append(k) + keys = [self._get_label_or_level_values(x, axis=axis) + for x in by] indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) indexer = ensure_platform_int(indexer) @@ -4580,7 +4581,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(labels, MultiIndex): from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer(labels._get_labels_for_sorting(), + indexer = lexsort_indexer(labels._get_codes_for_sorting(), orders=ascending, na_position=na_position) else: @@ -4611,40 +4612,6 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return self._constructor(new_data).__finalize__(self) - def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, - sort_remaining=True): - """Sort multilevel index by chosen axis and primary level. Data will be - lexicographically sorted by the chosen level followed by the other - levels (in order). - - .. deprecated:: 0.20.0 - Use :meth:`DataFrame.sort_index` - - - Parameters - ---------- - level : int - axis : {0 or 'index', 1 or 'columns'}, default 0 - ascending : boolean, default True - inplace : boolean, default False - Sort the DataFrame without creating a new instance - sort_remaining : boolean, default True - Sort by the other levels too. - - Returns - ------- - sorted : DataFrame - - See Also - -------- - DataFrame.sort_index(level=...) - - """ - warnings.warn("sortlevel is deprecated, use sort_index(level= ...)", - FutureWarning, stacklevel=2) - return self.sort_index(level=level, axis=axis, ascending=ascending, - inplace=inplace, sort_remaining=sort_remaining) - def nlargest(self, n, columns, keep='first'): """ Return the first `n` rows ordered by `columns` in descending order. @@ -4683,7 +4650,7 @@ def nlargest(self, n, columns, keep='first'): -------- DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in ascending order. - DataFrame.sort_values : Sort DataFrame by the values + DataFrame.sort_values : Sort DataFrame by the values. DataFrame.head : Return the first `n` rows without re-ordering. Notes @@ -4694,60 +4661,63 @@ def nlargest(self, n, columns, keep='first'): Examples -------- - >>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2], - ... 'b': list('abdcef'), - ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]}) + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... 
"Nauru", "Tuvalu", "Anguilla"]) >>> df - a b c - 0 1 a 1.0 - 1 10 b 2.0 - 2 8 d NaN - 3 11 c 3.0 - 4 8 e 4.0 - 5 2 f 9.0 + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI In the following example, we will use ``nlargest`` to select the three - rows having the largest values in column "a". + rows having the largest values in column "population". - >>> df.nlargest(3, 'a') - a b c - 3 11 c 3.0 - 1 10 b 2.0 - 2 8 d NaN + >>> df.nlargest(3, 'population') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nlargest(3, 'a', keep='last') - a b c - 3 11 c 3.0 - 1 10 b 2.0 - 4 8 e 4.0 + >>> df.nlargest(3, 'population', keep='last') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN When using ``keep='all'``, all duplicate items are maintained: - >>> df.nlargest(3, 'a', keep='all') - a b c - 3 11 c 3.0 - 1 10 b 2.0 - 2 8 d NaN - 4 8 e 4.0 - - To order by the largest values in column "a" and then "c", we can - specify multiple columns like in the next example. - - >>> df.nlargest(3, ['a', 'c']) - a b c - 4 8 e 4.0 - 3 11 c 3.0 - 1 10 b 2.0 + >>> df.nlargest(3, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN - Attempting to use ``nlargest`` on non-numeric dtypes will raise a - ``TypeError``: + To order by the largest values in column "population" and then "GDP", + we can specify multiple columns like in the next example. - >>> df.nlargest(3, 'b') - - Traceback (most recent call last): - TypeError: Column 'b' has dtype object, cannot use method 'nlargest' + >>> df.nlargest(3, ['population', 'GDP']) + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN """ return algorithms.SelectNFrame(self, n=n, @@ -4755,15 +4725,23 @@ def nlargest(self, n, columns, keep='first'): columns=columns).nlargest() def nsmallest(self, n, columns, keep='first'): - """Get the rows of a DataFrame sorted by the `n` smallest - values of `columns`. + """ + Return the first `n` rows ordered by `columns` in ascending order. + + Return the first `n` rows with the smallest values in `columns`, in + ascending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=True).head(n)``, but more + performant. Parameters ---------- n : int - Number of items to retrieve + Number of items to retrieve. columns : list or str - Column name or names to order by + Column name or names to order by. keep : {'first', 'last', 'all'}, default 'first' Where there are duplicate values: @@ -4778,62 +4756,70 @@ def nsmallest(self, n, columns, keep='first'): ------- DataFrame + See Also + -------- + DataFrame.nlargest : Return the first `n` rows ordered by `columns` in + descending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + Examples -------- - >>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2], - ... 'b': list('abdcef'), - ... 
'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]}) + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) >>> df - a b c - 0 1 a 1.0 - 1 10 b 2.0 - 2 8 d NaN - 3 11 c 3.0 - 4 8 e 4.0 - 5 2 f 9.0 + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI In the following example, we will use ``nsmallest`` to select the three rows having the smallest values in column "a". - >>> df.nsmallest(3, 'a') - a b c - 0 1 a 1.0 - 5 2 f 9.0 - 2 8 d NaN + >>> df.nsmallest(3, 'population') + population GDP alpha-2 + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nsmallest(3, 'a', keep='last') - a b c - 0 1 a 1.0 - 5 2 f 9.0 - 4 8 e 4.0 + >>> df.nsmallest(3, 'population', keep='last') + population GDP alpha-2 + Anguilla 11300 311 AI + Tuvalu 11300 38 TV + Nauru 11300 182 NR When using ``keep='all'``, all duplicate items are maintained: - >>> df.nsmallest(3, 'a', keep='all') - a b c - 0 1 a 1.0 - 5 2 f 9.0 - 2 8 d NaN - 4 8 e 4.0 + >>> df.nsmallest(3, 'population', keep='all') + population GDP alpha-2 + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI To order by the largest values in column "a" and then "c", we can specify multiple columns like in the next example. - >>> df.nsmallest(3, ['a', 'c']) - a b c - 0 1 a 1.0 - 5 2 f 9.0 - 4 8 e 4.0 - - Attempting to use ``nsmallest`` on non-numeric dtypes will raise a - ``TypeError``: - - >>> df.nsmallest(3, 'b') - - Traceback (most recent call last): - TypeError: Column 'b' has dtype object, cannot use method 'nsmallest' + >>> df.nsmallest(3, ['population', 'GDP']) + population GDP alpha-2 + Tuvalu 11300 38 TV + Nauru 11300 182 NR + Anguilla 11300 311 AI """ return algorithms.SelectNFrame(self, n=n, @@ -4842,7 +4828,7 @@ def nsmallest(self, n, columns, keep='first'): def swaplevel(self, i=-2, j=-1, axis=0): """ - Swap levels i and j in a MultiIndex on a particular axis + Swap levels i and j in a MultiIndex on a particular axis. Parameters ---------- @@ -4857,7 +4843,6 @@ def swaplevel(self, i=-2, j=-1, axis=0): The indexes ``i`` and ``j`` are now optional, and default to the two innermost levels of the index. - """ result = self.copy() @@ -4870,8 +4855,8 @@ def swaplevel(self, i=-2, j=-1, axis=0): def reorder_levels(self, order, axis=0): """ - Rearrange index levels using input order. - May not drop or duplicate levels + Rearrange index levels using input order. May not drop or + duplicate levels. 
Parameters ---------- @@ -4937,21 +4922,16 @@ def _combine_match_index(self, other, func, level=None): index=left.index, columns=self.columns, copy=False) - def _combine_match_columns(self, other, func, level=None, try_cast=True): + def _combine_match_columns(self, other, func, level=None): assert isinstance(other, Series) left, right = self.align(other, join='outer', axis=1, level=level, copy=False) assert left.columns.equals(right.index) return ops.dispatch_to_series(left, right, func, axis="columns") - def _combine_const(self, other, func, errors='raise', try_cast=True): - if lib.is_scalar(other) or np.ndim(other) == 0: - return ops.dispatch_to_series(self, other, func) - - new_data = self._data.eval(func=func, other=other, - errors=errors, - try_cast=try_cast) - return self._constructor(new_data) + def _combine_const(self, other, func): + assert lib.is_scalar(other) or np.ndim(other) == 0 + return ops.dispatch_to_series(self, other, func) def combine(self, other, func, fill_value=None, overwrite=True): """ @@ -5056,7 +5036,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): See Also -------- DataFrame.combine_first : Combine two DataFrame objects and default to - non-null values in frame calling the method + non-null values in frame calling the method. """ other_idxlen = len(other.index) # save for compare @@ -5095,31 +5075,31 @@ def combine(self, other, func, fill_value=None, overwrite=True): series[this_mask] = fill_value otherSeries[other_mask] = fill_value - # if we have different dtypes, possibly promote - new_dtype = this_dtype - if not is_dtype_equal(this_dtype, other_dtype): + if col not in self.columns: + # If self DataFrame does not have col in other DataFrame, + # try to promote series, which is all NaN, as other_dtype. + new_dtype = other_dtype + try: + series = series.astype(new_dtype, copy=False) + except ValueError: + # e.g. new_dtype is integer types + pass + else: + # if we have different dtypes, possibly promote new_dtype = find_common_type([this_dtype, other_dtype]) if not is_dtype_equal(this_dtype, new_dtype): series = series.astype(new_dtype) if not is_dtype_equal(other_dtype, new_dtype): otherSeries = otherSeries.astype(new_dtype) - # see if we need to be represented as i8 (datetimelike) - # try to keep us at this dtype - needs_i8_conversion_i = needs_i8_conversion(new_dtype) - if needs_i8_conversion_i: - arr = func(series, otherSeries, True) - else: - arr = func(series, otherSeries) - + arr = func(series, otherSeries) arr = maybe_downcast_to_dtype(arr, this_dtype) result[col] = arr # convert_objects just in case return self._constructor(result, index=new_index, - columns=new_columns)._convert(datetime=True, - copy=False) + columns=new_columns) def combine_first(self, other): """ @@ -5162,26 +5142,46 @@ def combine_first(self, other): See Also -------- DataFrame.combine : Perform series-wise operation on two DataFrames - using a given function + using a given function. """ import pandas.core.computation.expressions as expressions - def combiner(x, y, needs_i8_conversion=False): - x_values = x.values if hasattr(x, 'values') else x - y_values = y.values if hasattr(y, 'values') else y - if needs_i8_conversion: - mask = isna(x) - x_values = x_values.view('i8') - y_values = y_values.view('i8') - else: - mask = isna(x_values) + def extract_values(arr): + # Does two things: + # 1. maybe gets the values from the Series / Index + # 2. 
convert datelike to i8 + if isinstance(arr, (ABCIndexClass, ABCSeries)): + arr = arr._values + + if needs_i8_conversion(arr): + # TODO(DatetimelikeArray): just use .asi8 + if is_extension_array_dtype(arr.dtype): + arr = arr.asi8 + else: + arr = arr.view('i8') + return arr + + def combiner(x, y): + mask = isna(x) + if isinstance(mask, (ABCIndexClass, ABCSeries)): + mask = mask._values + + x_values = extract_values(x) + y_values = extract_values(y) + + # If the column y in other DataFrame is not in first DataFrame, + # just return y_values. + if y.name not in self.columns: + return y_values return expressions.where(mask, y_values, x_values) return self.combine(other, combiner, overwrite=False) + @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', + mapping={False: 'ignore', True: 'raise'}) def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): + errors='ignore'): """ Modify in place using non-NA values from another DataFrame. @@ -5205,17 +5205,28 @@ def update(self, other, join='left', overwrite=True, filter_func=None, * False: only update values that are NA in the original DataFrame. - filter_func : callable(1d-array) -> boolean 1d-array, optional + filter_func : callable(1d-array) -> bool 1d-array, optional Can choose to replace values other than NA. Return True for values that should be updated. - raise_conflict : bool, default False - If True, will raise a ValueError if the DataFrame and `other` + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. + .. versionchanged :: 0.24.0 + Changed from `raise_conflict=False|True` + to `errors='ignore'|'raise'`. + + Returns + ------- + None : method directly changes calling object + Raises ------ ValueError - When `raise_conflict` is True and there's overlapping non-NA data. + * When `errors='raise'` and there's overlapping non-NA data. + * When `errors` is not either `'ignore'` or `'raise'` + NotImplementedError + * If `join != 'left'` See Also -------- @@ -5286,6 +5297,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None, # TODO: Support other joins if join != 'left': # pragma: no cover raise NotImplementedError("Only left join is supported") + if errors not in ['ignore', 'raise']: + raise ValueError("The parameter errors must be either " + "'ignore' or 'raise'") if not isinstance(other, DataFrame): other = DataFrame(other) @@ -5299,7 +5313,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, with np.errstate(all='ignore'): mask = ~filter_func(this) | isna(that) else: - if raise_conflict: + if errors == 'raise': mask_this = notna(that) mask_that = notna(this) if any(mask_this & mask_that): @@ -5356,9 +5370,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None, See Also -------- - DataFrame.pivot_table : generalization of pivot that can handle + DataFrame.pivot_table : Generalization of pivot that can handle duplicate values for one index/column pair. - DataFrame.unstack : pivot based on the index values instead of a + DataFrame.unstack : Pivot based on the index values instead of a column. Notes @@ -5431,7 +5445,7 @@ def pivot(self, index=None, columns=None, values=None): _shared_docs['pivot_table'] = """ Create a spreadsheet-style pivot table as a DataFrame. 
The levels in the pivot table will be stored in MultiIndex objects (hierarchical - indexes) on the index and columns of the result DataFrame + indexes) on the index and columns of the result DataFrame. Parameters ----------%s @@ -5471,59 +5485,81 @@ def pivot(self, index=None, columns=None, values=None): ... "C": ["small", "large", "large", "small", ... "small", "large", "small", "small", ... "large"], - ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) >>> df - A B C D - 0 foo one small 1 - 1 foo one large 2 - 2 foo one large 2 - 3 foo two small 3 - 4 foo two small 3 - 5 bar one large 4 - 6 bar one small 5 - 7 bar two small 6 - 8 bar two large 7 + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table C large small A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 + bar one 4 5 + two 7 6 + foo one 4 1 + two NaN 6 + + We can also fill missing values using the `fill_value` parameter. >>> table = pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc=np.sum) + ... columns=['C'], aggfunc=np.sum, fill_value=0) >>> table C large small A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 'E': np.mean}) + >>> table + D E + mean mean + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... aggfunc={'D': np.mean, ... 'E': [min, max, np.mean]}) >>> table D E - mean max median min + mean max mean min A C - bar large 5.500000 16 14.5 13 - small 5.500000 15 14.5 14 - foo large 2.000000 10 9.5 9 - small 2.333333 12 11.0 8 + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 Returns ------- table : DataFrame - See also + See Also -------- - DataFrame.pivot : pivot without aggregation that can handle - non-numeric data + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. """ @Substitution('') @@ -5711,9 +5747,11 @@ def unstack(self, level=-1, fill_value=None): """ Pivot a level of the (necessarily hierarchical) index labels, returning a DataFrame having a new level of column labels whose inner-most level - consists of the pivoted index labels. If the index is not a MultiIndex, - the output will be a Series (the analogue of stack when the columns are - not a MultiIndex). + consists of the pivoted index labels. + + If the index is not a MultiIndex, the output will be a Series + (the analogue of stack when the columns are not a MultiIndex). + The level involved will automatically get sorted. Parameters @@ -5725,7 +5763,7 @@ def unstack(self, level=-1, fill_value=None): .. versionadded:: 0.18.0 - See also + See Also -------- DataFrame.pivot : Pivot a table based on column values. 
DataFrame.stack : Pivot a level of the column labels (inverse operation @@ -5769,7 +5807,7 @@ def unstack(self, level=-1, fill_value=None): return unstack(self, level, fill_value) _shared_docs['melt'] = (""" - "Unpivots" a DataFrame from wide format to long format, optionally + Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. This function is useful to massage a DataFrame into a format where one @@ -5795,7 +5833,7 @@ def unstack(self, level=-1, fill_value=None): col_level : int or string, optional If columns are a MultiIndex then use this level to melt. - See also + See Also -------- %(other)s pivot_table @@ -5975,8 +6013,7 @@ def _gotitem(self, ): # type: (...) -> Union[Series, DataFrame] """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- @@ -6036,7 +6073,7 @@ def _gotitem(self, 3 NaN dtype: float64 - See also + See Also -------- DataFrame.apply : Perform any type of operations. DataFrame.transform : Perform transformation type operations. @@ -6170,11 +6207,11 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, side-effects, as they will take effect twice for the first column/row. - See also + See Also -------- - DataFrame.applymap: For elementwise operations - DataFrame.aggregate: only perform aggregating type operations - DataFrame.transform: only perform transforming type operations + DataFrame.applymap: For elementwise operations. + DataFrame.aggregate: Only perform aggregating type operations. + DataFrame.transform: Only perform transforming type operations. Examples -------- @@ -6279,9 +6316,9 @@ def applymap(self, func): DataFrame Transformed DataFrame. - See also + See Also -------- - DataFrame.apply : Apply a function along input axis of DataFrame + DataFrame.apply : Apply a function along input axis of DataFrame. Examples -------- @@ -6362,10 +6399,10 @@ def append(self, other, ignore_index=False, those rows to a list and then concatenate the list with the original DataFrame all at once. - See also + See Also -------- pandas.concat : General function to concatenate DataFrame, Series - or Panel objects + or Panel objects. Examples -------- @@ -6462,123 +6499,121 @@ def append(self, other, ignore_index=False, def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): """ - Join columns with other DataFrame either on index or on a key - column. Efficiently Join multiple DataFrame objects by index at once by + Join columns of another DataFrame. + + Join columns with `other` DataFrame either on index or on a key + column. Efficiently join multiple DataFrame objects by index at once by passing a list. Parameters ---------- - other : DataFrame, Series with name field set, or list of DataFrame + other : DataFrame, Series, or list of DataFrame Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be - used as the column name in the resulting joined DataFrame - on : name, tuple/list of names, or array-like + used as the column name in the resulting joined DataFrame. + on : str, list of str, or array-like, optional Column or index level name(s) in the caller to join on the index in `other`, otherwise joins index-on-index. If multiple values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in - the calling DataFrame. 
Like an Excel VLOOKUP operation - how : {'left', 'right', 'outer', 'inner'}, default: 'left' + the calling DataFrame. Like an Excel VLOOKUP operation. + how : {'left', 'right', 'outer', 'inner'}, default 'left' How to handle the operation of the two objects. * left: use calling frame's index (or column if on is specified) - * right: use other frame's index + * right: use `other`'s index. * outer: form union of calling frame's index (or column if on is - specified) with other frame's index, and sort it - lexicographically + specified) with `other`'s index, and sort it. + lexicographically. * inner: form intersection of calling frame's index (or column if - on is specified) with other frame's index, preserving the order - of the calling's one - lsuffix : string - Suffix to use from left frame's overlapping columns - rsuffix : string - Suffix to use from right frame's overlapping columns - sort : boolean, default False + on is specified) with `other`'s index, preserving the order + of the calling's one. + lsuffix : str, default '' + Suffix to use from left frame's overlapping columns. + rsuffix : str, default '' + Suffix to use from right frame's overlapping columns. + sort : bool, default False Order result DataFrame lexicographically by the join key. If False, - the order of the join key depends on the join type (how keyword) + the order of the join key depends on the join type (how keyword). + + Returns + ------- + DataFrame + A dataframe containing columns from both the caller and `other`. Notes ----- - on, lsuffix, and rsuffix options are not supported when passing a list - of DataFrame objects + Parameters `on`, `lsuffix`, and `rsuffix` are not supported when + passing a list of `DataFrame` objects. Support for specifying index levels as the `on` parameter was added - in version 0.23.0 + in version 0.23.0. + + See Also + -------- + DataFrame.merge : For column(s)-on-columns(s) operations. Examples -------- - >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], - ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) - - >>> caller - A key - 0 A0 K0 - 1 A1 K1 - 2 A2 K2 - 3 A3 K3 - 4 A4 K4 - 5 A5 K5 + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df + key A + 0 K0 A0 + 1 K1 A1 + 2 K2 A2 + 3 K3 A3 + 4 K4 A4 + 5 K5 A5 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], ... 'B': ['B0', 'B1', 'B2']}) >>> other - B key - 0 B0 K0 - 1 B1 K1 - 2 B2 K2 + key B + 0 K0 B0 + 1 K1 B1 + 2 K2 B2 Join DataFrames using their indexes. - >>> caller.join(other, lsuffix='_caller', rsuffix='_other') - - >>> A key_caller B key_other - 0 A0 K0 B0 K0 - 1 A1 K1 B1 K1 - 2 A2 K2 B2 K2 - 3 A3 K3 NaN NaN - 4 A4 K4 NaN NaN - 5 A5 K5 NaN NaN - + >>> df.join(other, lsuffix='_caller', rsuffix='_other') + key_caller A key_other B + 0 K0 A0 K0 B0 + 1 K1 A1 K1 B1 + 2 K2 A2 K2 B2 + 3 K3 A3 NaN NaN + 4 K4 A4 NaN NaN + 5 K5 A5 NaN NaN If we want to join using the key columns, we need to set key to be - the index in both caller and other. The joined DataFrame will have + the index in both `df` and `other`. The joined DataFrame will have key as its index. - >>> caller.set_index('key').join(other.set_index('key')) - - >>> A B - key - K0 A0 B0 - K1 A1 B1 - K2 A2 B2 - K3 A3 NaN - K4 A4 NaN - K5 A5 NaN - - Another option to join using the key columns is to use the on - parameter. DataFrame.join always uses other's index but we can use any - column in the caller. 
This method preserves the original caller's + >>> df.set_index('key').join(other.set_index('key')) + A B + key + K0 A0 B0 + K1 A1 B1 + K2 A2 B2 + K3 A3 NaN + K4 A4 NaN + K5 A5 NaN + + Another option to join using the key columns is to use the `on` + parameter. DataFrame.join always uses `other`'s index but we can use + any column in `df`. This method preserves the original DataFrame's index in the result. - >>> caller.join(other.set_index('key'), on='key') - - >>> A key B - 0 A0 K0 B0 - 1 A1 K1 B1 - 2 A2 K2 B2 - 3 A3 K3 NaN - 4 A4 K4 NaN - 5 A5 K5 NaN - - - See also - -------- - DataFrame.merge : For column(s)-on-columns(s) operations - - Returns - ------- - joined : DataFrame + >>> df.join(other.set_index('key'), on='key') + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K2 A2 B2 + 3 K3 A3 NaN + 4 K4 A4 NaN + 5 K5 A5 NaN """ # For SparseDataFrame's benefit return self._join_compat(other, on=on, how=how, lsuffix=lsuffix, @@ -6653,6 +6688,15 @@ def round(self, decimals=0, *args, **kwargs): of `decimals` which are not columns of the input will be ignored. + Returns + ------- + DataFrame + + See Also + -------- + numpy.around + Series.round + Examples -------- >>> df = pd.DataFrame(np.random.random([3, 3]), @@ -6678,15 +6722,6 @@ def round(self, decimals=0, *args, **kwargs): first 0.0 1 0.17 second 0.0 1 0.58 third 0.9 0 0.49 - - Returns - ------- - DataFrame object - - See Also - -------- - numpy.around - Series.round """ from pandas.core.reshape.concat import concat @@ -6729,7 +6764,7 @@ def _series_round(s, decimals): def corr(self, method='pearson', min_periods=1): """ - Compute pairwise correlation of columns, excluding NA/null values + Compute pairwise correlation of columns, excluding NA/null values. Parameters ---------- @@ -6752,7 +6787,6 @@ def corr(self, method='pearson', min_periods=1): Examples -------- - >>> import numpy as np >>> histogram_intersection = lambda a, b: np.minimum(a, b ... ).sum().round(decimals=1) >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], @@ -6835,10 +6869,10 @@ def cov(self, min_periods=None): See Also -------- - pandas.Series.cov : compute covariance with another Series - pandas.core.window.EWM.cov: exponential weighted sample covariance - pandas.core.window.Expanding.cov : expanding sample covariance - pandas.core.window.Rolling.cov : rolling sample covariance + pandas.Series.cov : Compute covariance with another Series. + pandas.core.window.EWM.cov: Exponential weighted sample covariance. + pandas.core.window.Expanding.cov : Expanding sample covariance. + pandas.core.window.Rolling.cov : Rolling sample covariance. Notes ----- @@ -6995,11 +7029,11 @@ def count(self, axis=0, level=None, numeric_only=False): See Also -------- - Series.count: number of non-NA elements in a Series - DataFrame.shape: number of DataFrame rows and columns (including NA - elements) - DataFrame.isna: boolean same-sized DataFrame showing places of NA - elements + Series.count: Number of non-NA elements in a Series. + DataFrame.shape: Number of DataFrame rows and columns (including NA + elements). + DataFrame.isna: Boolean same-sized DataFrame showing places of NA + elements. 
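Not part of the diff, just a small sketch tying together the three entries in the count See Also block above: shape includes NA cells, count() excludes them, and isna() is the mask that explains the gap.

import numpy as np
import pandas as pd

df = pd.DataFrame({"person": ["John", "Lewis", "Myla"],
                   "age": [24.0, np.nan, 21.0]})

# shape counts every cell, NA or not.
print(df.shape)          # (3, 2)

# count() reports only non-NA cells per column.
print(df.count())        # person 3, age 2

# isna() gives the boolean mask behind the difference.
print(df.isna().sum())   # person 0, age 1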
Examples -------- @@ -7043,7 +7077,6 @@ def count(self, axis=0, level=None, numeric_only=False): John 2 Lewis 1 Myla 1 - """ axis = self._get_axis_number(axis) if level is not None: @@ -7101,8 +7134,9 @@ def _count_level(self, level, axis=0, numeric_only=False): level = count_axis._get_level_number(level) level_index = count_axis.levels[level] - labels = ensure_int64(count_axis.labels[level]) - counts = lib.count_level_2d(mask, labels, len(level_index), axis=0) + level_codes = ensure_int64(count_axis.codes[level]) + counts = lib.count_level_2d(mask, level_codes, len(level_index), + axis=0) result = DataFrame(counts, index=level_index, columns=agg_axis) @@ -7213,32 +7247,43 @@ def f(x): def nunique(self, axis=0, dropna=True): """ - Return Series with number of distinct observations over requested - axis. + Count distinct observations over requested axis. + + Return Series with number of distinct observations. Can ignore NaN + values. .. versionadded:: 0.20.0 Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - dropna : boolean, default True + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True Don't include NaN in the counts. Returns ------- nunique : Series + See Also + -------- + Series.nunique: Method nunique for Series. + DataFrame.count: Count non-NA cells for each column or row. + Examples -------- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) >>> df.nunique() A 3 B 1 + dtype: int64 >>> df.nunique(axis=1) 0 1 1 2 2 2 + dtype: int64 """ return self.apply(Series.nunique, axis=axis, dropna=dropna) @@ -7315,7 +7360,9 @@ def idxmax(self, axis=0, skipna=True): return Series(result, index=self._get_agg_axis(axis)) def _get_agg_axis(self, axis_num): - """ let's be explicit about this """ + """ + Let's be explicit about this. + """ if axis_num == 0: return self.columns elif axis_num == 1: @@ -7505,7 +7552,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, def to_timestamp(self, freq=None, how='start', axis=0, copy=True): """ - Cast to DatetimeIndex of timestamps, at *beginning* of period + Cast to DatetimeIndex of timestamps, at *beginning* of period. Parameters ---------- @@ -7541,7 +7588,7 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): def to_period(self, freq=None, axis=0, copy=True): """ Convert DataFrame from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed) + frequency (inferred from index if not passed). Parameters ---------- @@ -7678,338 +7725,6 @@ def isin(self, values): ops.add_special_arithmetic_methods(DataFrame) -def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): - """ - Segregate Series based on type and coerce into matrices. - Needs to handle a lot of exceptional cases. 
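The to_timestamp and to_period summaries reworded in this hunk have terse docstrings; a minimal round-trip sketch (illustrative only, not from the diff), assuming a monthly PeriodIndex:

import pandas as pd

df = pd.DataFrame({"sales": [10, 20, 30]},
                  index=pd.period_range("2018-01", periods=3, freq="M"))

# PeriodIndex -> DatetimeIndex anchored at the *beginning* of each period.
ts = df.to_timestamp(how="start")

# And back again; passing freq explicitly avoids relying on inference.
back = ts.to_period(freq="M")
print(back.index.equals(df.index))   # True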
- """ - # figure out the index, if necessary - if index is None: - index = extract_index(arrays) - else: - index = ensure_index(index) - - # don't force copy because getting jammed in an ndarray anyway - arrays = _homogenize(arrays, index, dtype) - - # from BlockManager perspective - axes = [ensure_index(columns), index] - - return create_block_manager_from_arrays(arrays, arr_names, axes) - - -def extract_index(data): - from pandas.core.index import _union_indexes - - index = None - if len(data) == 0: - index = Index([]) - elif len(data) > 0: - raw_lengths = [] - indexes = [] - - have_raw_arrays = False - have_series = False - have_dicts = False - - for v in data: - if isinstance(v, Series): - have_series = True - indexes.append(v.index) - elif isinstance(v, dict): - have_dicts = True - indexes.append(list(v.keys())) - elif is_list_like(v) and getattr(v, 'ndim', 1) == 1: - have_raw_arrays = True - raw_lengths.append(len(v)) - - if not indexes and not raw_lengths: - raise ValueError('If using all scalar values, you must pass' - ' an index') - - if have_series or have_dicts: - index = _union_indexes(indexes) - - if have_raw_arrays: - lengths = list(set(raw_lengths)) - if len(lengths) > 1: - raise ValueError('arrays must all be same length') - - if have_dicts: - raise ValueError('Mixing dicts with non-Series may lead to ' - 'ambiguous ordering.') - - if have_series: - if lengths[0] != len(index): - msg = ('array length %d does not match index length %d' % - (lengths[0], len(index))) - raise ValueError(msg) - else: - index = ibase.default_index(lengths[0]) - - return ensure_index(index) - - -def _prep_ndarray(values, copy=True): - if not isinstance(values, (np.ndarray, Series, Index)): - if len(values) == 0: - return np.empty((0, 0), dtype=object) - - def convert(v): - return maybe_convert_platform(v) - - # we could have a 1-dim or 2-dim list here - # this is equiv of np.asarray, but does object conversion - # and platform dtype preservation - try: - if is_list_like(values[0]) or hasattr(values[0], 'len'): - values = np.array([convert(v) for v in values]) - elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: - # GH#21861 - values = np.array([convert(v) for v in values]) - else: - values = convert(values) - except (ValueError, TypeError): - values = convert(values) - - else: - - # drop subclass info, do not copy data - values = np.asarray(values) - if copy: - values = values.copy() - - if values.ndim == 1: - values = values.reshape((values.shape[0], 1)) - elif values.ndim != 2: - raise ValueError('Must pass 2-d input') - - return values - - -def _to_arrays(data, columns, coerce_float=False, dtype=None): - """ - Return list of arrays, columns - """ - if isinstance(data, DataFrame): - if columns is not None: - arrays = [data._ixs(i, axis=1).values - for i, col in enumerate(data.columns) if col in columns] - else: - columns = data.columns - arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] - - return arrays, columns - - if not len(data): - if isinstance(data, np.ndarray): - columns = data.dtype.names - if columns is not None: - return [[]] * len(columns), columns - return [], [] # columns if columns is not None else [] - if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, - dtype=dtype) - elif isinstance(data[0], compat.Mapping): - return _list_of_dict_to_arrays(data, columns, - coerce_float=coerce_float, dtype=dtype) - elif isinstance(data[0], Series): - return _list_of_series_to_arrays(data, columns, - 
coerce_float=coerce_float, - dtype=dtype) - elif isinstance(data[0], Categorical): - if columns is None: - columns = ibase.default_index(len(data)) - return data, columns - elif (isinstance(data, (np.ndarray, Series, Index)) and - data.dtype.names is not None): - - columns = list(data.dtype.names) - arrays = [data[k] for k in columns] - return arrays, columns - else: - # last ditch effort - data = lmap(tuple, data) - return _list_to_arrays(data, columns, coerce_float=coerce_float, - dtype=dtype) - - -def _masked_rec_array_to_mgr(data, index, columns, dtype, copy): - """ extract from a masked rec array and create the manager """ - - # essentially process a record array then fill it - fill_value = data.fill_value - fdata = ma.getdata(data) - if index is None: - index = _get_names_from_index(fdata) - if index is None: - index = ibase.default_index(len(data)) - index = ensure_index(index) - - if columns is not None: - columns = ensure_index(columns) - arrays, arr_columns = _to_arrays(fdata, columns) - - # fill if needed - new_arrays = [] - for fv, arr, col in zip(fill_value, arrays, arr_columns): - mask = ma.getmaskarray(data[col]) - if mask.any(): - arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) - arr[mask] = fv - new_arrays.append(arr) - - # create the manager - arrays, arr_columns = _reorder_arrays(new_arrays, arr_columns, columns) - if columns is None: - columns = arr_columns - - mgr = _arrays_to_mgr(arrays, arr_columns, index, columns) - - if copy: - mgr = mgr.copy() - return mgr - - -def _reorder_arrays(arrays, arr_columns, columns): - # reorder according to the columns - if (columns is not None and len(columns) and arr_columns is not None and - len(arr_columns)): - indexer = ensure_index(arr_columns).get_indexer(columns) - arr_columns = ensure_index([arr_columns[i] for i in indexer]) - arrays = [arrays[i] for i in indexer] - return arrays, arr_columns - - -def _list_to_arrays(data, columns, coerce_float=False, dtype=None): - if len(data) > 0 and isinstance(data[0], tuple): - content = list(lib.to_object_array_tuples(data).T) - else: - # list of lists - content = list(lib.to_object_array(data).T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) - - -def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): - from pandas.core.index import _get_objs_combined_axis - - if columns is None: - columns = _get_objs_combined_axis(data, sort=False) - - indexer_cache = {} - - aligned_values = [] - for s in data: - index = getattr(s, 'index', None) - if index is None: - index = ibase.default_index(len(s)) - - if id(index) in indexer_cache: - indexer = indexer_cache[id(index)] - else: - indexer = indexer_cache[id(index)] = index.get_indexer(columns) - - values = com.values_from_object(s) - aligned_values.append(algorithms.take_1d(values, indexer)) - - values = np.vstack(aligned_values) - - if values.dtype == np.object_: - content = list(values.T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) - else: - return values.T, columns - - -def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): - if columns is None: - gen = (list(x.keys()) for x in data) - sort = not any(isinstance(d, OrderedDict) for d in data) - columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) - - # assure that they are of the base dict class and not of derived - # classes - data = [(type(d) is dict) and d or dict(d) for d in data] - - content = list(lib.dicts_to_array(data, list(columns)).T) - 
return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) - - -def _convert_object_array(content, columns, coerce_float=False, dtype=None): - if columns is None: - columns = ibase.default_index(len(content)) - else: - if len(columns) != len(content): # pragma: no cover - # caller's responsibility to check for this... - raise AssertionError('{col:d} columns passed, passed data had ' - '{con} columns'.format(col=len(columns), - con=len(content))) - - # provide soft conversion of object dtypes - def convert(arr): - if dtype != object and dtype != np.object: - arr = lib.maybe_convert_objects(arr, try_float=coerce_float) - arr = maybe_cast_to_datetime(arr, dtype) - return arr - - arrays = [convert(arr) for arr in content] - - return arrays, columns - - -def _get_names_from_index(data): - has_some_name = any(getattr(s, 'name', None) is not None for s in data) - if not has_some_name: - return ibase.default_index(len(data)) - - index = lrange(len(data)) - count = 0 - for i, s in enumerate(data): - n = getattr(s, 'name', None) - if n is not None: - index[i] = n - else: - index[i] = 'Unnamed %d' % count - count += 1 - - return index - - -def _homogenize(data, index, dtype=None): - from pandas.core.series import _sanitize_array - - oindex = None - homogenized = [] - - for v in data: - if isinstance(v, Series): - if dtype is not None: - v = v.astype(dtype) - if v.index is not index: - # Forces alignment. No need to copy data since we - # are putting it into an ndarray later - v = v.reindex(index, copy=False) - else: - if isinstance(v, dict): - if oindex is None: - oindex = index.astype('O') - - if isinstance(index, (DatetimeIndex, TimedeltaIndex)): - v = com.dict_compat(v) - else: - v = dict(v) - v = lib.fast_multiget(v, oindex.values, default=np.nan) - v = _sanitize_array(v, index, dtype=dtype, copy=False, - raise_cast_failure=False) - - homogenized.append(v) - - return homogenized - - def _from_nested_dict(data): # TODO: this should be seriously cythonized new_data = OrderedDict() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8de52fbfa79f0..b3cb5c3be67f9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,62 +1,52 @@ # pylint: disable=W0231,E1101 import collections import functools -import warnings -import operator -import weakref import gc import json +import operator +import warnings +import weakref import numpy as np -import pandas as pd -from pandas._libs import tslib, properties -from pandas.core.dtypes.common import ( - ensure_int64, - ensure_object, - is_scalar, - is_number, - is_integer, is_bool, - is_bool_dtype, - is_categorical_dtype, - is_numeric_dtype, - is_datetime64_any_dtype, - is_timedelta64_dtype, - is_datetime64tz_dtype, - is_list_like, - is_dict_like, - is_re_compilable, - is_period_arraylike, - is_object_dtype, - pandas_dtype) +from pandas._libs import Timestamp, iNaT, properties +import pandas.compat as compat +from pandas.compat import ( + cPickle as pkl, isidentifier, lrange, lzip, map, set_function_name, + string_types, to_str, zip) +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import ( + Appender, Substitution, rewrite_axis_style_signature) +from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs + from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask +from pandas.core.dtypes.common import ( + ensure_int64, ensure_object, is_bool, is_bool_dtype, + is_datetime64_any_dtype, 
is_datetime64tz_dtype, is_dict_like, + is_extension_array_dtype, is_integer, is_list_like, is_number, + is_numeric_dtype, is_object_dtype, is_period_arraylike, is_re_compilable, + is_scalar, is_timedelta64_dtype, pandas_dtype) +from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna -from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame +import pandas as pd +from pandas.core import config, missing, nanops +import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin -from pandas.core.index import (Index, MultiIndex, ensure_index, - InvalidIndexError, RangeIndex) -import pandas.core.indexing as indexing +import pandas.core.common as com +from pandas.core.index import ( + Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index) from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import PeriodIndex, Period +from pandas.core.indexes.period import Period, PeriodIndex +import pandas.core.indexing as indexing from pandas.core.internals import BlockManager -import pandas.core.algorithms as algos -import pandas.core.common as com -import pandas.core.missing as missing +from pandas.core.ops import _align_method_FRAME + +from pandas.io.formats.format import DataFrameFormatter, format_percentiles from pandas.io.formats.printing import pprint_thing -from pandas.io.formats.format import format_percentiles, DataFrameFormatter from pandas.tseries.frequencies import to_offset -from pandas import compat -from pandas.compat.numpy import function as nv -from pandas.compat import (map, zip, lzip, lrange, string_types, to_str, - isidentifier, set_function_name, cPickle as pkl) -from pandas.core.ops import _align_method_FRAME -import pandas.core.nanops as nanops -from pandas.util._decorators import (Appender, Substitution, - deprecate_kwarg) -from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core import config # goal is to be able to define the docs close to function, while still being # able to share @@ -115,12 +105,19 @@ class NDFrame(PandasObject, SelectionMixin): '_default_fill_value', '_metadata', '__array_struct__', '__array_interface__'] _internal_names_set = set(_internal_names) - _accessors = frozenset([]) + _accessors = frozenset() _deprecations = frozenset(['as_blocks', 'blocks', - 'consolidate', 'convert_objects', 'is_copy']) + 'convert_objects', 'is_copy']) _metadata = [] _is_copy = None + # dummy attribute so that datetime.__eq__(Series/DataFrame) defers + # by returning NotImplemented + timetuple = None + + # ---------------------------------------------------------------------- + # Constructors + def __init__(self, data, axes=None, copy=False, dtype=None, fastpath=False): @@ -138,8 +135,30 @@ def __init__(self, data, axes=None, copy=False, dtype=None, object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) + def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): + """ passed a manager and a axes dict """ + for a, axe in axes.items(): + if axe is not None: + mgr = mgr.reindex_axis(axe, + axis=self._get_block_manager_axis(a), + copy=False) + + # make a copy if explicitly requested + if copy: + mgr = mgr.copy() + if dtype is not None: + # avoid further copies if we can + if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: + mgr = mgr.astype(dtype=dtype) + return mgr + + # 
---------------------------------------------------------------------- + @property def is_copy(self): + """ + Return the copy. + """ warnings.warn("Attribute 'is_copy' is deprecated and will be removed " "in a future version.", FutureWarning, stacklevel=2) return self._is_copy @@ -150,17 +169,6 @@ def is_copy(self, msg): "in a future version.", FutureWarning, stacklevel=2) self._is_copy = msg - def _repr_data_resource_(self): - """ - Not a real Jupyter special repr method, but we use the same - naming convention. - """ - if config.get_option("display.html.table_schema"): - data = self.head(config.get_option('display.max_rows')) - payload = json.loads(data.to_json(orient='table'), - object_pairs_hook=collections.OrderedDict) - return payload - def _validate_dtype(self, dtype): """ validate the passed dtype """ @@ -175,23 +183,6 @@ def _validate_dtype(self, dtype): return dtype - def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): - """ passed a manager and a axes dict """ - for a, axe in axes.items(): - if axe is not None: - mgr = mgr.reindex_axis(axe, - axis=self._get_block_manager_axis(a), - copy=False) - - # make a copy if explicitly requested - if copy: - mgr = mgr.copy() - if dtype is not None: - # avoid further copies if we can - if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: - mgr = mgr.astype(dtype=dtype) - return mgr - # ---------------------------------------------------------------------- # Construction @@ -200,28 +191,14 @@ def _constructor(self): """Used when a manipulation result has the same dimensions as the original. """ - raise com.AbstractMethodError(self) - - def __unicode__(self): - # unicode representation based upon iterating over self - # (since, by definition, `PandasContainers` are iterable) - prepr = '[%s]' % ','.join(map(pprint_thing, self)) - return '%s(%s)' % (self.__class__.__name__, prepr) - - def _dir_additions(self): - """ add the string-like attributes from the info_axis. - If info_axis is a MultiIndex, it's first level values are used. - """ - additions = {c for c in self._info_axis.unique(level=0)[:100] - if isinstance(c, string_types) and isidentifier(c)} - return super(NDFrame, self)._dir_additions().union(additions) + raise AbstractMethodError(self) @property def _constructor_sliced(self): """Used when a manipulation result has one lower dimension(s) as the original, such as DataFrame single columns slicing. 
""" - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) @property def _constructor_expanddim(self): @@ -358,41 +335,44 @@ def _from_axes(cls, data, axes, **kwargs): d.update(kwargs) return cls(data, **d) - def _get_axis_number(self, axis): - axis = self._AXIS_ALIASES.get(axis, axis) + @classmethod + def _get_axis_number(cls, axis): + axis = cls._AXIS_ALIASES.get(axis, axis) if is_integer(axis): - if axis in self._AXIS_NAMES: + if axis in cls._AXIS_NAMES: return axis else: try: - return self._AXIS_NUMBERS[axis] + return cls._AXIS_NUMBERS[axis] except KeyError: pass raise ValueError('No axis named {0} for object type {1}' - .format(axis, type(self))) + .format(axis, type(cls))) - def _get_axis_name(self, axis): - axis = self._AXIS_ALIASES.get(axis, axis) + @classmethod + def _get_axis_name(cls, axis): + axis = cls._AXIS_ALIASES.get(axis, axis) if isinstance(axis, string_types): - if axis in self._AXIS_NUMBERS: + if axis in cls._AXIS_NUMBERS: return axis else: try: - return self._AXIS_NAMES[axis] + return cls._AXIS_NAMES[axis] except KeyError: pass raise ValueError('No axis named {0} for object type {1}' - .format(axis, type(self))) + .format(axis, type(cls))) def _get_axis(self, axis): name = self._get_axis_name(axis) return getattr(self, name) - def _get_block_manager_axis(self, axis): + @classmethod + def _get_block_manager_axis(cls, axis): """Map the axis to the block_manager axis.""" - axis = self._get_axis_number(axis) - if self._AXIS_REVERSED: - m = self._AXIS_LEN - 1 + axis = cls._get_axis_number(axis) + if cls._AXIS_REVERSED: + m = cls._AXIS_LEN - 1 return m - axis return axis @@ -442,12 +422,16 @@ def _stat_axis(self): @property def shape(self): - """Return a tuple of axis dimensions""" + """ + Return a tuple of axis dimensions + """ return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS) @property def axes(self): - """Return index label(s) of the internal NDFrame""" + """ + Return index label(s) of the internal NDFrame + """ # we do it this way because if we have reversed axes, then # the block manager shows then reversed return [self._get_axis(a) for a in self._AXIS_ORDERS] @@ -560,7 +544,7 @@ def set_axis(self, labels, axis=0, inplace=None): See Also -------- - pandas.DataFrame.rename_axis : Alter the name of the index or columns. + DataFrame.rename_axis : Alter the name of the index or columns. Examples -------- @@ -712,7 +696,8 @@ def swapaxes(self, axis1, axis2, copy=True): return self._constructor(new_values, *new_axes).__finalize__(self) def droplevel(self, level, axis=0): - """Return DataFrame with requested index / column level(s) removed. + """ + Return DataFrame with requested index / column level(s) removed. .. versionadded:: 0.24.0 @@ -725,7 +710,6 @@ def droplevel(self, level, axis=0): axis : {0 or 'index', 1 or 'columns'}, default 0 - Returns ------- DataFrame.droplevel() @@ -764,7 +748,6 @@ def droplevel(self, level, axis=0): 1 2 3 4 5 6 7 8 9 10 11 12 - """ labels = self._get_axis(axis) new_labels = labels.droplevel(level) @@ -849,8 +832,8 @@ def squeeze(self, axis=None): See Also -------- - Series.iloc : Integer-location based indexing for selecting scalars - DataFrame.iloc : Integer-location based indexing for selecting Series + Series.iloc : Integer-location based indexing for selecting scalars. + DataFrame.iloc : Integer-location based indexing for selecting Series. Series.to_frame : Inverse of DataFrame.squeeze for a single-column DataFrame. 
@@ -1080,20 +1063,6 @@ def rename(self, *args, **kwargs): if com.count_not_none(*axes.values()) == 0: raise TypeError('must pass an index to rename') - # renamer function if passed a dict - def _get_rename_function(mapper): - if isinstance(mapper, (dict, ABCSeries)): - - def f(x): - if x in mapper: - return mapper[x] - else: - return x - else: - f = mapper - - return f - self._consolidate_inplace() result = self if inplace else self.copy(deep=copy) @@ -1102,7 +1071,7 @@ def f(x): v = axes.get(self._AXIS_NAMES[axis]) if v is None: continue - f = _get_rename_function(v) + f = com._get_rename_function(v) baxis = self._get_block_manager_axis(axis) if level is not None: @@ -1116,27 +1085,45 @@ def f(x): else: return result.__finalize__(self) - def rename_axis(self, mapper, axis=0, copy=True, inplace=False): + @rewrite_axis_style_signature('mapper', [('copy', True), + ('inplace', False)]) + def rename_axis(self, mapper=None, **kwargs): """ - Alter the name of the index or columns. + Set the name of the axis for the index or columns. Parameters ---------- mapper : scalar, list-like, optional - Value to set as the axis name attribute. + Value to set the axis name attribute. + index, columns : scalar, list-like, dict-like or function, optional + A scalar, list-like, dict-like or functions transformations to + apply to that axis' values. + + Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` + and/or ``columns``. + + .. versionchanged:: 0.24.0 + axis : {0 or 'index', 1 or 'columns'}, default 0 - The index or the name of the axis. - copy : boolean, default True + The axis to rename. + copy : bool, default True Also copy underlying data. - inplace : boolean, default False + inplace : bool, default False Modifies the object directly, instead of creating a new Series or DataFrame. Returns ------- - renamed : Series, DataFrame, or None + Series, DataFrame, or None The same type as the caller or None if `inplace` is True. + See Also + -------- + Series.rename : Alter Series index labels or name. + DataFrame.rename : Alter DataFrame index labels or name. + Index.rename : Set new names on index. + Notes ----- Prior to version 0.21.0, ``rename_axis`` could also be used to change @@ -1144,95 +1131,195 @@ def rename_axis(self, mapper, axis=0, copy=True, inplace=False): deprecated and will be removed in a future version. Use ``rename`` instead. - See Also - -------- - pandas.Series.rename : Alter Series index labels or name - pandas.DataFrame.rename : Alter DataFrame index labels or name - pandas.Index.rename : Set new names on index + ``DataFrame.rename_axis`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + The first calling convention will only modify the names of + the index and/or the names of the Index object that is the columns. + In this case, the parameter ``copy`` is ignored. + + The second calling convention will modify the names of the + the corresponding index if mapper is a list or a scalar. + However, if mapper is dict-like or a function, it will use the + deprecated behavior of modifying the axis *labels*. + + We *highly* recommend using keyword arguments to clarify your + intent. 
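A short sketch (not part of the docstring) of the behaviour the notes above spell out: scalar or keyword arguments set the axis names, while a dict-like mapper still hits the deprecated label-renaming path and warns.

import pandas as pd

df = pd.DataFrame({"num_legs": [4, 2]}, index=["dog", "hawk"])

# Recommended: keyword form sets the axis *names*.
named = df.rename_axis(index="animal", columns="attribute")
print(named.index.name)   # 'animal'

# Deprecated branch: a dict-like mapper renames the *labels* and emits a
# FutureWarning pointing at .rename instead.
relabelled = df.rename_axis({"dog": "puppy"}, axis=0)   # FutureWarning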
Examples -------- **Series** - >>> s = pd.Series([1, 2, 3]) - >>> s.rename_axis("foo") - foo - 0 1 - 1 2 - 2 3 - dtype: int64 + >>> s = pd.Series(["dog", "cat", "monkey"]) + >>> s + 0 dog + 1 cat + 2 monkey + dtype: object + >>> s.rename_axis("animal") + animal + 0 dog + 1 cat + 2 monkey + dtype: object **DataFrame** - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - >>> df.rename_axis("foo") - A B - foo - 0 1 4 - 1 2 5 - 2 3 6 - - >>> df.rename_axis("bar", axis="columns") - bar A B - 0 1 4 - 1 2 5 - 2 3 6 - """ + >>> df = pd.DataFrame({"num_legs": [4, 4, 2], + ... "num_arms": [0, 0, 2]}, + ... ["dog", "cat", "monkey"]) + >>> df + num_legs num_arms + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("animal") + >>> df + num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("limbs", axis="columns") + >>> df + limbs num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + + **MultiIndex** + + >>> df.index = pd.MultiIndex.from_product([['mammal'], + ... ['dog', 'cat', 'monkey']], + ... names=['type', 'name']) + >>> df + limbs num_legs num_arms + type name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + + >>> df.rename_axis(index={'type': 'class'}) + limbs num_legs num_arms + class name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + + >>> df.rename_axis(columns=str.upper) + LIMBS num_legs num_arms + type name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + """ + axes, kwargs = self._construct_axes_from_arguments((), kwargs) + copy = kwargs.pop('copy', True) + inplace = kwargs.pop('inplace', False) + axis = kwargs.pop('axis', 0) + if axis is not None: + axis = self._get_axis_number(axis) + + if kwargs: + raise TypeError('rename_axis() got an unexpected keyword ' + 'argument "{0}"'.format(list(kwargs.keys())[0])) + inplace = validate_bool_kwarg(inplace, 'inplace') - non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not - is_dict_like(mapper)) - if non_mapper: - return self._set_axis_name(mapper, axis=axis, inplace=inplace) + + if (mapper is not None): + # Use v0.23 behavior if a scalar or list + non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not + is_dict_like(mapper)) + if non_mapper: + return self._set_axis_name(mapper, axis=axis, inplace=inplace) + else: + # Deprecated (v0.21) behavior is if mapper is specified, + # and not a list or scalar, then call rename + msg = ("Using 'rename_axis' to alter labels is deprecated. " + "Use '.rename' instead") + warnings.warn(msg, FutureWarning, stacklevel=3) + axis = self._get_axis_name(axis) + d = {'copy': copy, 'inplace': inplace} + d[axis] = mapper + return self.rename(**d) else: - msg = ("Using 'rename_axis' to alter labels is deprecated. " - "Use '.rename' instead") - warnings.warn(msg, FutureWarning, stacklevel=2) - axis = self._get_axis_name(axis) - d = {'copy': copy, 'inplace': inplace} - d[axis] = mapper - return self.rename(**d) + # Use new behavior. Means that index and/or columns + # is specified + result = self if inplace else self.copy(deep=copy) + + for axis in lrange(self._AXIS_LEN): + v = axes.get(self._AXIS_NAMES[axis]) + if v is None: + continue + non_mapper = is_scalar(v) or (is_list_like(v) and not + is_dict_like(v)) + if non_mapper: + newnames = v + else: + f = com._get_rename_function(v) + curnames = self._get_axis(axis).names + newnames = [f(name) for name in curnames] + result._set_axis_name(newnames, axis=axis, + inplace=True) + if not inplace: + return result def _set_axis_name(self, name, axis=0, inplace=False): """ - Alter the name or names of the axis. 
+ Set the name(s) of the axis. Parameters ---------- name : str or list of str - Name for the Index, or list of names for the MultiIndex - axis : int or str - 0 or 'index' for the index; 1 or 'columns' for the columns - inplace : bool - whether to modify `self` directly or return a copy + Name(s) to set. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to set the label. The value 0 or 'index' specifies index, + and the value 1 or 'columns' specifies columns. + inplace : bool, default False + If `True`, do operation inplace and return None. .. versionadded:: 0.21.0 Returns ------- - renamed : same type as caller or None if inplace=True + Series, DataFrame, or None + The same type as the caller or `None` if `inplace` is `True`. See Also -------- - pandas.DataFrame.rename - pandas.Series.rename - pandas.Index.rename + DataFrame.rename : Alter the axis labels of :class:`DataFrame`. + Series.rename : Alter the index labels or set the index name + of :class:`Series`. + Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`. Examples -------- - >>> df._set_axis_name("foo") - A - foo - 0 1 - 1 2 - 2 3 - >>> df.index = pd.MultiIndex.from_product([['A'], ['a', 'b', 'c']]) - >>> df._set_axis_name(["bar", "baz"]) - A - bar baz - A a 1 - b 2 - c 3 - """ + >>> df = pd.DataFrame({"num_legs": [4, 4, 2]}, + ... ["dog", "cat", "monkey"]) + >>> df + num_legs + dog 4 + cat 4 + monkey 2 + >>> df._set_axis_name("animal") + num_legs + animal + dog 4 + cat 4 + monkey 2 + >>> df.index = pd.MultiIndex.from_product( + ... [["mammal"], ['dog', 'cat', 'monkey']]) + >>> df._set_axis_name(["type", "name"]) + legs + type name + mammal dog 4 + cat 4 + monkey 2 + """ + pd.MultiIndex.from_product([["mammal"], ['dog', 'cat', 'monkey']]) axis = self._get_axis_number(axis) idx = self._get_axis(axis).set_names(name) @@ -1243,48 +1330,12 @@ def _set_axis_name(self, name, axis=0, inplace=False): return renamed # ---------------------------------------------------------------------- - # Comparisons + # Comparison Methods def _indexed_same(self, other): return all(self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS) - def __neg__(self): - values = com.values_from_object(self) - if is_bool_dtype(values): - arr = operator.inv(values) - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) - or is_object_dtype(values)): - arr = operator.neg(values) - else: - raise TypeError("Unary negative expects numeric dtype, not {}" - .format(values.dtype)) - return self.__array_wrap__(arr) - - def __pos__(self): - values = com.values_from_object(self) - if (is_bool_dtype(values) or is_period_arraylike(values)): - arr = values - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) - or is_object_dtype(values)): - arr = operator.pos(values) - else: - raise TypeError("Unary plus expects numeric dtype, not {}" - .format(values.dtype)) - return self.__array_wrap__(arr) - - def __invert__(self): - try: - arr = operator.inv(com.values_from_object(self)) - return self.__array_wrap__(arr) - except Exception: - - # inv fails with 0 len - if not np.prod(self.shape): - return self - - raise - def equals(self, other): """ Test whether two objects contain the same elements. 
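A brief, hedged illustration of the equals contract introduced here: NaNs in matching positions compare equal, element-wise == does not treat them that way, and differing dtypes make equals return False.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan]})

# Same shape, same elements, NaNs in the same place: equals is True.
print(df.equals(df.copy()))            # True

# Element-wise comparison treats NaN != NaN.
print((df == df.copy()).all().all())   # False

# Equal values but different dtypes (int64 vs float64) are not "equal".
print(pd.DataFrame({"a": [1, 2]}).equals(pd.DataFrame({"a": [1.0, 2.0]})))  # False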
@@ -1371,6 +1422,75 @@ def equals(self, other): return False return self._data.equals(other._data) + # ------------------------------------------------------------------------- + # Unary Methods + + def __neg__(self): + values = com.values_from_object(self) + if is_bool_dtype(values): + arr = operator.inv(values) + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): + arr = operator.neg(values) + else: + raise TypeError("Unary negative expects numeric dtype, not {}" + .format(values.dtype)) + return self.__array_wrap__(arr) + + def __pos__(self): + values = com.values_from_object(self) + if (is_bool_dtype(values) or is_period_arraylike(values)): + arr = values + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): + arr = operator.pos(values) + else: + raise TypeError("Unary plus expects numeric dtype, not {}" + .format(values.dtype)) + return self.__array_wrap__(arr) + + def __invert__(self): + try: + arr = operator.inv(com.values_from_object(self)) + return self.__array_wrap__(arr) + except Exception: + + # inv fails with 0 len + if not np.prod(self.shape): + return self + + raise + + def __nonzero__(self): + raise ValueError("The truth value of a {0} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + .format(self.__class__.__name__)) + + __bool__ = __nonzero__ + + def bool(self): + """ + Return the bool of a single element PandasObject. + + This must be a boolean scalar value, either True or False. Raise a + ValueError if the PandasObject does not have exactly 1 element, or that + element is not boolean + """ + v = self.squeeze() + if isinstance(v, (bool, np.bool_)): + return bool(v) + elif is_scalar(v): + raise ValueError("bool cannot act on a non-boolean single element " + "{0}".format(self.__class__.__name__)) + + self.__nonzero__() + + def __abs__(self): + return self.abs() + + def __round__(self, decimals=0): + return self.round(decimals) + # ------------------------------------------------------------------------- # Label or Level Combination Helpers # @@ -1391,14 +1511,14 @@ def _is_level_reference(self, key, axis=0): Parameters ---------- - key: str + key : str Potential level name for the given axis - axis: int, default 0 + axis : int, default 0 Axis that levels are associated with (0 for index, 1 for columns) Returns ------- - is_level: bool + is_level : bool """ axis = self._get_axis_number(axis) @@ -1756,45 +1876,20 @@ def empty(self): >>> df.dropna().empty True - See also + See Also -------- pandas.Series.dropna pandas.DataFrame.dropna """ return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS) - def __nonzero__(self): - raise ValueError("The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." - .format(self.__class__.__name__)) - - __bool__ = __nonzero__ - - def bool(self): - """Return the bool of a single element PandasObject. - - This must be a boolean scalar value, either True or False. 
Raise a - ValueError if the PandasObject does not have exactly 1 element, or that - element is not boolean - """ - v = self.squeeze() - if isinstance(v, (bool, np.bool_)): - return bool(v) - elif is_scalar(v): - raise ValueError("bool cannot act on a non-boolean single element " - "{0}".format(self.__class__.__name__)) - - self.__nonzero__() - - def __abs__(self): - return self.abs() - - def __round__(self, decimals=0): - return self.round(decimals) - # ---------------------------------------------------------------------- # Array Interface + # This is also set in IndexOpsMixin + # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented + __array_priority__ = 1000 + def __array__(self, dtype=None): return com.values_from_object(self) @@ -1811,7 +1906,9 @@ def __array_wrap__(self, result, context=None): # return dict(typestr=values.dtype.str,shape=values.shape,data=values) def to_dense(self): - """Return dense representation of NDFrame (as opposed to sparse)""" + """ + Return dense representation of NDFrame (as opposed to sparse) + """ # compat return self @@ -1863,7 +1960,13 @@ def __setstate__(self, state): self._item_cache = {} # ---------------------------------------------------------------------- - # IO + # Rendering Methods + + def __unicode__(self): + # unicode representation based upon iterating over self + # (since, by definition, `PandasContainers` are iterable) + prepr = '[%s]' % ','.join(map(pprint_thing, self)) + return '%s(%s)' % (self.__class__.__name__, prepr) def _repr_latex_(self): """ @@ -1875,20 +1978,32 @@ def _repr_latex_(self): else: return None + def _repr_data_resource_(self): + """ + Not a real Jupyter special repr method, but we use the same + naming convention. + """ + if config.get_option("display.html.table_schema"): + data = self.head(config.get_option('display.max_rows')) + payload = json.loads(data.to_json(orient='table'), + object_pairs_hook=collections.OrderedDict) + return payload + # ---------------------------------------------------------------------- # I/O Methods _shared_docs['to_excel'] = """ - Write %(klass)s to an excel sheet. + Write %(klass)s to an Excel sheet. - To write a single %(klass)s to an excel .xlsx file it is only necessary to + To write a single %(klass)s to an Excel .xlsx file it is only necessary to specify a target file name. To write to multiple sheets it is necessary to create an `ExcelWriter` object with a target file name, and specify a sheet - in the file to write to. Multiple sheets may be written to by - specifying unique `sheet_name`. With all data written to the file it is - necessary to save the changes. Note that creating an ExcelWriter object - with a file name that already exists will result in the contents of the - existing file being erased. + in the file to write to. + + Multiple sheets may be written to by specifying unique `sheet_name`. + With all data written to the file it is necessary to save the changes. + Note that creating an `ExcelWriter` object with a file name that already + exists will result in the contents of the existing file being erased. Parameters ---------- @@ -1946,8 +2061,8 @@ def _repr_latex_(self): See Also -------- - pandas.read_excel - pandas.ExcelWriter + read_excel + ExcelWriter Examples -------- @@ -1957,17 +2072,18 @@ def _repr_latex_(self): >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], ... index=['row 1', 'row 2'], ... 
columns=['col 1', 'col 2']) - >>> df1.to_excel("output.xlsx") + >>> df1.to_excel("output.xlsx") # doctest: +SKIP To specify the sheet name: - >>> df1.to_excel("output.xlsx", sheet_name='Sheet_name_1') + >>> df1.to_excel("output.xlsx", + ... sheet_name='Sheet_name_1') # doctest: +SKIP If you wish to write to more than one sheet in the workbook, it is necessary to specify an ExcelWriter object: >>> df2 = df1.copy() - >>> with pd.ExcelWriter('output.xlsx') as writer: + >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP ... df1.to_excel(writer, sheet_name='Sheet_name_1') ... df2.to_excel(writer, sheet_name='Sheet_name_2') @@ -1975,10 +2091,28 @@ def _repr_latex_(self): you can pass the `engine` keyword (the default engine is automatically chosen depending on the file extension): - >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') - + >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP """ + @Appender(_shared_docs["to_excel"] % dict(klass="object")) + def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", + float_format=None, columns=None, header=True, index=True, + index_label=None, startrow=0, startcol=0, engine=None, + merge_cells=True, encoding=None, inf_rep="inf", verbose=True, + freeze_panes=None): + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + from pandas.io.formats.excel import ExcelFormatter + formatter = ExcelFormatter(df, na_rep=na_rep, cols=columns, + header=header, + float_format=float_format, index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep) + formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow, + startcol=startcol, freeze_panes=freeze_panes, + engine=engine) + def to_json(self, path_or_buf=None, orient=None, date_format=None, double_precision=10, force_ascii=True, date_unit='ms', default_handler=None, lines=False, compression='infer', @@ -2066,7 +2200,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, See Also -------- - pandas.read_json + read_json Examples -------- @@ -2225,14 +2359,13 @@ def to_hdf(self, path_or_buf, key, **kwargs): >>> import os >>> os.remove('data.h5') - """ from pandas.io import pytables return pytables.to_hdf(path_or_buf, key, self, **kwargs) def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): """ - msgpack (serialize) object to input file path + Serialize object to input file path using msgpack format. THIS IS AN EXPERIMENTAL LIBRARY and the storage format may not be stable until a future release. @@ -2299,7 +2432,16 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True, See Also -------- - pandas.read_sql : read a DataFrame from a table + read_sql : Read a DataFrame from a table. + + Notes + ----- + Timezone aware datetime columns will be written as + ``Timestamp with timezone`` type with SQLAlchemy if supported by the + database. Otherwise, the datetimes will be stored as timezone unaware + timestamps local to the original timezone. + + .. 
versionadded:: 0.24.0 References ---------- @@ -2713,6 +2855,148 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, if buf is None: return formatter.buf.getvalue() + def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, + columns=None, header=True, index=True, index_label=None, + mode='w', encoding=None, compression='infer', quoting=None, + quotechar='"', line_terminator=None, chunksize=None, + tupleize_cols=None, date_format=None, doublequote=True, + escapechar=None, decimal='.'): + r""" + Write object to a comma-separated values (csv) file. + + .. versionchanged:: 0.24.0 + The order of arguments for Series was changed. + + Parameters + ---------- + path_or_buf : str or file handle, default None + File path or object, if None is provided the result is returned as + a string. + + .. versionchanged:: 0.24.0 + + Was previously named "path" for Series. + + sep : str, default ',' + String of length 1. Field delimiter for the output file. + na_rep : str, default '' + Missing data representation. + float_format : str, default None + Format string for floating point numbers. + columns : sequence, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of strings is given it is + assumed to be aliases for the column names. + + .. versionchanged:: 0.24.0 + + Previously defaulted to False for Series. + + index : bool, default True + Write row names (index). + index_label : str or sequence, or False, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the object uses MultiIndex. If + False do not print fields for index names. Use index_label=False + for easier importing in R. + mode : str + Python write mode, default 'w'. + encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. + compression : str, default 'infer' + Compression mode among the following possible values: {'infer', + 'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf` + is path-like, then detect compression from the following + extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no + compression). + + .. versionchanged:: 0.24.0 + + 'infer' option added and set to default. + + quoting : optional constant from csv module + Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` + then floats are converted to strings and thus csv.QUOTE_NONNUMERIC + will treat them as non-numeric. + quotechar : str, default '\"' + String of length 1. Character used to quote fields. + line_terminator : string, optional + The newline character or character sequence to use in the output + file. Defaults to `os.linesep`, which depends on the OS in which + this method is called ('\n' for linux, '\r\n' for Windows, i.e.). + + .. versionchanged:: 0.24.0 + chunksize : int or None + Rows to write at a time. + tupleize_cols : bool, default False + Write MultiIndex columns as a list of tuples (if True) or in + the new, expanded format, where each MultiIndex column is a row + in the CSV (if False). + + .. deprecated:: 0.21.0 + This argument will be removed and will always write each row + of the multi-index as a separate row in the CSV file. + date_format : str, default None + Format string for datetime objects. + doublequote : bool, default True + Control quoting of `quotechar` inside a field. 
+ escapechar : str, default None + String of length 1. Character used to escape `sep` and `quotechar` + when appropriate. + decimal : str, default '.' + Character recognized as decimal separator. E.g. use ',' for + European data. + + Returns + ------- + None or str + If path_or_buf is None, returns the resulting csv format as a + string. Otherwise returns None. + + See Also + -------- + read_csv : Load a CSV file into a DataFrame. + to_excel : Load an Excel file into a DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], + ... 'mask': ['red', 'purple'], + ... 'weapon': ['sai', 'bo staff']}) + >>> df.to_csv(index=False) + 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + """ + + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + if tupleize_cols is not None: + warnings.warn("The 'tupleize_cols' parameter is deprecated and " + "will be removed in a future version", + FutureWarning, stacklevel=2) + else: + tupleize_cols = False + + from pandas.io.formats.csvs import CSVFormatter + formatter = CSVFormatter(df, path_or_buf, + line_terminator=line_terminator, sep=sep, + encoding=encoding, + compression=compression, quoting=quoting, + na_rep=na_rep, float_format=float_format, + cols=columns, header=header, index=index, + index_label=index_label, mode=mode, + chunksize=chunksize, quotechar=quotechar, + tupleize_cols=tupleize_cols, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, decimal=decimal) + formatter.save() + + if path_or_buf is None: + return formatter.path_or_buf.getvalue() + # ---------------------------------------------------------------------- # Fancy Indexing @@ -2779,7 +3063,7 @@ def _iget_item_cache(self, item): return lower def _box_item_values(self, key, values): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _maybe_cache_changed(self, item, value): """The object has called back to us saying maybe it has changed. @@ -3151,72 +3435,102 @@ class max_speed def xs(self, key, axis=0, level=None, drop_level=True): """ - Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. Defaults to cross-section on the rows (axis=0). + Return cross-section from the Series/DataFrame. + + This method takes a `key` argument to select data at a particular + level of a MultiIndex. Parameters ---------- - key : object - Some label contained in the index, or partially in a MultiIndex - axis : int, default 0 - Axis to retrieve cross-section on + key : label or tuple of label + Label contained in the index, or partially in a MultiIndex. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis to retrieve cross-section on. level : object, defaults to first n levels (n=1 or len(key)) In case of a key partially contained in a MultiIndex, indicate which levels are used. Levels can be referred by label or position. - drop_level : boolean, default True + drop_level : bool, default True If False, returns object with same levels as self. 
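The drop_level flag described just above is not covered by the examples added below, so here is a small hedged sketch of it:

import pandas as pd

df = pd.DataFrame({"num_legs": [4, 2]},
                  index=pd.MultiIndex.from_tuples(
                      [("mammal", "dog"), ("bird", "penguin")],
                      names=["class", "animal"]))

# Default: the level used for selection is dropped from the result.
print(df.xs("mammal").index.names)                    # ['animal']

# drop_level=False keeps the full MultiIndex on the cross-section.
print(df.xs("mammal", drop_level=False).index.names)  # ['class', 'animal']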
- Examples - -------- - >>> df - A B C - a 4 5 2 - b 4 0 9 - c 9 7 3 - >>> df.xs('a') - A 4 - B 5 - C 2 - Name: a - >>> df.xs('C', axis=1) - a 2 - b 9 - c 3 - Name: C - - >>> df - A B C D - first second third - bar one 1 4 1 8 9 - two 1 7 5 5 0 - baz one 1 6 6 8 0 - three 2 5 3 5 3 - >>> df.xs(('baz', 'three')) - A B C D - third - 2 5 3 5 3 - >>> df.xs('one', level=1) - A B C D - first third - bar 1 4 1 8 9 - baz 1 6 6 8 0 - >>> df.xs(('baz', 2), level=[0, 'third']) - A B C D - second - three 5 3 5 3 - Returns ------- - xs : Series or DataFrame + Series or DataFrame + Cross-section from the original Series or DataFrame + corresponding to the selected index levels. + + See Also + -------- + DataFrame.loc : Access a group of rows and columns + by label(s) or a boolean array. + DataFrame.iloc : Purely integer-location based indexing + for selection by position. Notes ----- - xs is only for getting, not setting values. + `xs` can not be used to set values. - MultiIndex Slicers is a generic way to get/set values on any level or - levels. It is a superset of xs functionality, see - :ref:`MultiIndex Slicers ` + MultiIndex Slicers is a generic way to get/set values on + any level or levels. + It is a superset of `xs` functionality, see + :ref:`MultiIndex Slicers `. + Examples + -------- + >>> d = {'num_legs': [4, 4, 2, 2], + ... 'num_wings': [0, 0, 2, 2], + ... 'class': ['mammal', 'mammal', 'mammal', 'bird'], + ... 'animal': ['cat', 'dog', 'bat', 'penguin'], + ... 'locomotion': ['walks', 'walks', 'flies', 'walks']} + >>> df = pd.DataFrame(data=d) + >>> df = df.set_index(['class', 'animal', 'locomotion']) + >>> df + num_legs num_wings + class animal locomotion + mammal cat walks 4 0 + dog walks 4 0 + bat flies 2 2 + bird penguin walks 2 2 + + Get values at specified index + + >>> df.xs('mammal') + num_legs num_wings + animal locomotion + cat walks 4 0 + dog walks 4 0 + bat flies 2 2 + + Get values at several indexes + + >>> df.xs(('mammal', 'dog')) + num_legs num_wings + locomotion + walks 4 0 + + Get values at specified index and level + + >>> df.xs('cat', level=1) + num_legs num_wings + class locomotion + mammal walks 4 0 + + Get values at several indexes and levels + + >>> df.xs(('bird', 'walks'), + ... level=[0, 'locomotion']) + num_legs num_wings + animal + penguin 2 2 + + Get values at specified column and axis + + >>> df.xs('num_wings', axis=1) + class animal locomotion + mammal cat walks 0 + dog walks 0 + bat flies 2 + bird penguin walks 2 + Name: num_wings, dtype: int64 """ axis = self._get_axis_number(axis) labels = self._get_axis(axis) @@ -3282,7 +3596,8 @@ def xs(self, key, axis=0, level=None, drop_level=True): _xs = xs def select(self, crit, axis=0): - """Return data corresponding to axis labels matching criteria + """ + Return data corresponding to axis labels matching criteria .. deprecated:: 0.21.0 Use df.loc[df.index.map(crit)] to select via labels @@ -3316,29 +3631,99 @@ def select(self, crit, axis=0): def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): - """Return an object with matching indices to myself. + """ + Return an object with matching indices as other object. + + Conform the object to the same index on all axes. Optional + filling logic, placing NaN in locations having no value + in the previous index. A new object is produced unless the + new index is equivalent to the current one and copy=False. 
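The deprecation note on select above names its replacement; a hedged sketch of that migration, with illustrative data only:

import pandas as pd

df = pd.DataFrame({"val": [1, 2, 3]}, index=["apple", "banana", "avocado"])
crit = lambda label: label.startswith("a")

# Deprecated since 0.21.0:  df.select(crit)
# Suggested replacement with identical row selection:
print(df.loc[df.index.map(crit)])   # keeps 'apple' and 'avocado'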
Parameters ---------- - other : Object - method : string or None - copy : boolean, default True + other : Object of the same data type + Its row and column indices are used to define the new indices + of this object. + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: propagate last valid observation forward to next + valid + * backfill / bfill: use next valid observation to fill gap + * nearest: use nearest valid observations to fill gap + + copy : bool, default True + Return a new object, even if the passed indexes are the same. limit : int, default None Maximum number of consecutive labels to fill for inexact matches. tolerance : optional - Maximum distance between labels of the other object and this - object for inexact matches. Can be list-like. + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. .. versionadded:: 0.21.0 (list-like tolerance) + Returns + ------- + Series or DataFrame + Same type as caller, but with changed indices on each axis. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex : Change to new indices or expand indices. + Notes ----- - Like calling s.reindex(index=other.index, columns=other.columns, - method=...) + Same as calling + ``.reindex(index=other.index, columns=other.columns,...)``. - Returns - ------- - reindexed : same as input + Examples + -------- + >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'], + ... [31, 87.8, 'high'], + ... [22, 71.6, 'medium'], + ... [35, 95, 'medium']], + ... columns=['temp_celsius', 'temp_fahrenheit', 'windspeed'], + ... index=pd.date_range(start='2014-02-12', + ... end='2014-02-15', freq='D')) + + >>> df1 + temp_celsius temp_fahrenheit windspeed + 2014-02-12 24.3 75.7 high + 2014-02-13 31.0 87.8 high + 2014-02-14 22.0 71.6 medium + 2014-02-15 35.0 95.0 medium + + >>> df2 = pd.DataFrame([[28, 'low'], + ... [30, 'low'], + ... [35.1, 'medium']], + ... columns=['temp_celsius', 'windspeed'], + ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', + ... '2014-02-15'])) + + >>> df2 + temp_celsius windspeed + 2014-02-12 28.0 low + 2014-02-13 30.0 low + 2014-02-15 35.1 medium + + >>> df2.reindex_like(df1) + temp_celsius temp_fahrenheit windspeed + 2014-02-12 28.0 NaN low + 2014-02-13 30.0 NaN low + 2014-02-14 NaN NaN NaN + 2014-02-15 35.1 NaN medium """ d = other._construct_axes_dict(axes=self._AXIS_ORDERS, method=method, copy=copy, limit=limit, @@ -3705,36 +4090,36 @@ def reindex(self, *args, **kwargs): Conform %(klass)s to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. A new object is produced unless the new index is equivalent to the current one and - copy=False + ``copy=False``. 
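A quick hedged check of the equivalence stated in the reindex_like notes above, namely that the call is shorthand for reindexing on the other object's labels:

import pandas as pd

df1 = pd.DataFrame({"a": [1, 2, 3]}, index=list("xyz"))
df2 = pd.DataFrame({"a": [10.0], "b": [20.0]}, index=["y"])

# reindex_like(other) behaves like
# reindex(index=other.index, columns=other.columns)
via_like = df2.reindex_like(df1)
via_reindex = df2.reindex(index=df1.index, columns=df1.columns)
print(via_like.equals(via_reindex))   # True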
Parameters ---------- %(optional_labels)s - %(axes)s : array-like, optional (should be specified using keywords) - New labels / index to conform to. Preferably an Index object to - avoid duplicating data + %(axes)s : array-like, optional + New labels / index to conform to, should be specified using + keywords. Preferably an Index object to avoid duplicating data %(optional_axis)s - method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}, optional - method to use for filling holes in reindexed DataFrame. + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. Please note: this is only applicable to DataFrames/Series with a monotonically increasing/decreasing index. - * default: don't fill gaps + * None (default): don't fill gaps * pad / ffill: propagate last valid observation forward to next valid * backfill / bfill: use next valid observation to fill gap * nearest: use nearest valid observations to fill gap - copy : boolean, default True - Return a new object, even if the passed indexes are the same + copy : bool, default True + Return a new object, even if the passed indexes are the same. level : int or name Broadcast across a level, matching Index values on the - passed MultiIndex level + passed MultiIndex level. fill_value : scalar, default np.NaN Value to use for missing values. Defaults to NaN, but can be any - "compatible" value + "compatible" value. limit : int, default None - Maximum number of consecutive elements to forward or backward fill + Maximum number of consecutive elements to forward or backward fill. tolerance : optional Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations most @@ -3748,6 +4133,12 @@ def reindex(self, *args, **kwargs): .. versionadded:: 0.21.0 (list-like tolerance) + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex_like : Change to same indices as other DataFrame. + Examples -------- @@ -3839,12 +4230,12 @@ def reindex(self, *args, **kwargs): ... index=date_index) >>> df2 prices - 2010-01-01 100 - 2010-01-02 101 + 2010-01-01 100.0 + 2010-01-02 101.0 2010-01-03 NaN - 2010-01-04 100 - 2010-01-05 89 - 2010-01-06 88 + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 Suppose we decide to expand the dataframe to cover a wider date range. @@ -3855,12 +4246,12 @@ def reindex(self, *args, **kwargs): 2009-12-29 NaN 2009-12-30 NaN 2009-12-31 NaN - 2010-01-01 100 - 2010-01-02 101 + 2010-01-01 100.0 + 2010-01-02 101.0 2010-01-03 NaN - 2010-01-04 100 - 2010-01-05 89 - 2010-01-06 88 + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 2010-01-07 NaN The index entries that did not have a value in the original data frame @@ -3873,15 +4264,15 @@ def reindex(self, *args, **kwargs): >>> df2.reindex(date_index2, method='bfill') prices - 2009-12-29 100 - 2009-12-30 100 - 2009-12-31 100 - 2010-01-01 100 - 2010-01-02 101 + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 2010-01-03 NaN - 2010-01-04 100 - 2010-01-05 89 - 2010-01-06 88 + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 2010-01-07 NaN Please note that the ``NaN`` value present in the original dataframe @@ -3895,7 +4286,7 @@ def reindex(self, *args, **kwargs): Returns ------- - reindexed : %(klass)s + %(klass)s with changed index. 
""" # TODO: Decide if we care about having different examples for different # kinds @@ -3967,11 +4358,11 @@ def _needs_reindex_multi(self, axes, method, level): def _reindex_multi(self, axes, copy, fill_value): return NotImplemented - _shared_docs[ - 'reindex_axis'] = ("""Conform input object to new index with optional - filling logic, placing NA/NaN in locations having no value in the - previous index. A new object is produced unless the new index is - equivalent to the current one and copy=False + _shared_docs['reindex_axis'] = (""" + Conform input object to new index + with optional filling logic, placing NA/NaN in locations having + no value in the previous index. A new object is produced unless + the new index is equivalent to the current one and copy=False. Parameters ---------- @@ -4008,17 +4399,20 @@ def _reindex_multi(self, axes, copy, fill_value): .. versionadded:: 0.21.0 (list-like tolerance) - Examples - -------- - >>> df.reindex_axis(['A', 'B', 'C'], axis=1) - See Also -------- - reindex, reindex_like + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. Returns ------- - reindexed : %(klass)s + %(klass)s + + Examples + -------- + >>> df.reindex_axis(['A', 'B', 'C'], axis=1) """) @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) @@ -4065,16 +4459,6 @@ def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, return self._constructor(new_data).__finalize__(self) - # TODO: unused; remove? - def _reindex_axis(self, new_index, fill_method, axis, copy): - new_data = self._data.reindex_axis(new_index, axis=axis, - method=fill_method, copy=copy) - - if new_data is self._data and not copy: - return self - else: - return self._constructor(new_data).__finalize__(self) - def filter(self, items=None, like=None, regex=None, axis=None): """ Subset rows or columns of dataframe according to labels in @@ -4124,7 +4508,7 @@ def filter(self, items=None, like=None, regex=None, axis=None): See Also -------- - pandas.DataFrame.loc + DataFrame.loc Notes ----- @@ -4183,7 +4567,7 @@ def head(self, n=5): See Also -------- - pandas.DataFrame.tail: Returns the last `n` rows. + DataFrame.tail: Returns the last `n` rows. Examples -------- @@ -4242,7 +4626,7 @@ def tail(self, n=5): See Also -------- - pandas.DataFrame.head : The first `n` rows of the caller object. + DataFrame.head : The first `n` rows of the caller object. Examples -------- @@ -4492,9 +4876,9 @@ def sample(self, n=None, frac=None, replace=False, weights=None, See Also -------- - pandas.DataFrame.apply - pandas.DataFrame.applymap - pandas.Series.map + DataFrame.apply + DataFrame.applymap + Series.map """) @Appender(_shared_docs['pipe'] % _shared_doc_kwargs) @@ -4682,6 +5066,14 @@ def __setattr__(self, name, value): stacklevel=2) object.__setattr__(self, name, value) + def _dir_additions(self): + """ add the string-like attributes from the info_axis. + If info_axis is a MultiIndex, it's first level values are used. 
+ """ + additions = {c for c in self._info_axis.unique(level=0)[:100] + if isinstance(c, string_types) and isidentifier(c)} + return super(NDFrame, self)._dir_additions().union(additions) + # ---------------------------------------------------------------------- # Getting and setting elements @@ -4728,18 +5120,6 @@ def _consolidate(self, inplace=False): cons_data = self._protect_consolidate(f) return self._constructor(cons_data).__finalize__(self) - def consolidate(self, inplace=False): - """Compute NDFrame with "consolidated" internals (data of each dtype - grouped together in a single ndarray). - - .. deprecated:: 0.20.0 - Consolidate will be an internal implementation only. - """ - # 15483 - warnings.warn("consolidate is deprecated and will be removed in a " - "future release.", FutureWarning, stacklevel=2) - return self._consolidate(inplace) - @property def _is_mixed_type(self): f = lambda: self._data.is_mixed_type @@ -4784,14 +5164,15 @@ def _get_bool_data(self): # Internal Interface Methods def as_matrix(self, columns=None): - """Convert the frame to its Numpy-array representation. + """ + Convert the frame to its Numpy-array representation. .. deprecated:: 0.23.0 Use :meth:`DataFrame.values` instead. Parameters ---------- - columns: list, optional, default:None + columns : list, optional, default:None If None, return all columns, otherwise, returns specified columns. Returns @@ -4800,7 +5181,6 @@ def as_matrix(self, columns=None): If the caller is heterogeneous and contains booleans or objects, the result will be of dtype=object. See Notes. - Notes ----- Return is NOT a Numpy-matrix, rather, a Numpy-array. @@ -4820,7 +5200,7 @@ def as_matrix(self, columns=None): See Also -------- - pandas.DataFrame.values + DataFrame.values """ warnings.warn("Method .as_matrix will be removed in a future version. " "Use .values instead.", FutureWarning, stacklevel=2) @@ -4833,6 +5213,10 @@ def values(self): """ Return a Numpy representation of the DataFrame. + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. + Only the values in the DataFrame will be returned, the axes labels will be removed. @@ -4894,8 +5278,9 @@ def values(self): See Also -------- - pandas.DataFrame.index : Retrieve the index labels - pandas.DataFrame.columns : Retrieving the column names + DataFrame.to_numpy : Recommended alternative to this method. + pandas.DataFrame.index : Retrieve the index labels. + pandas.DataFrame.columns : Retrieving the column names. """ self._consolidate_inplace() return self._data.as_array(transpose=self._AXIS_REVERSED) @@ -5017,7 +5402,7 @@ def get_ftype_counts(self): 1 b 2 2.0 2 c 3 3.0 - >>> df.get_ftype_counts() + >>> df.get_ftype_counts() # doctest: +SKIP float64:dense 1 int64:dense 1 object:dense 1 @@ -5047,7 +5432,7 @@ def dtypes(self): See Also -------- - pandas.DataFrame.ftypes : dtype and sparsity information. + pandas.DataFrame.ftypes : Dtype and sparsity information. Examples -------- @@ -5154,8 +5539,6 @@ def _to_dict_of_blocks(self, copy=True): return {k: self._constructor(v).__finalize__(self) for k, v, in self._data.to_dict(copy=copy).items()} - @deprecate_kwarg(old_arg_name='raise_on_error', new_arg_name='errors', - mapping={True: 'raise', False: 'ignore'}) def astype(self, dtype, copy=True, errors='raise', **kwargs): """ Cast a pandas object to a specified dtype ``dtype``. @@ -5167,11 +5550,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): the same type. 
Alternatively, use {col: dtype, ...}, where col is a column label and dtype is a numpy.dtype or Python type to cast one or more of the DataFrame's columns to column-specific types. - copy : bool, default True. + copy : bool, default True Return a copy when ``copy=True`` (be very careful setting ``copy=False`` as changes to values then may propagate to other pandas objects). - errors : {'raise', 'ignore'}, default 'raise'. + errors : {'raise', 'ignore'}, default 'raise' Control raising of exceptions on invalid data for provided dtype. - ``raise`` : allow exceptions to be raised @@ -5179,9 +5562,6 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): .. versionadded:: 0.20.0 - raise_on_error : raise on invalid input - .. deprecated:: 0.20.0 - Use ``errors`` instead kwargs : keyword arguments to pass on to the constructor Returns @@ -5210,7 +5590,9 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): Convert to ordered categorical type with custom ordering: - >>> ser.astype('category', ordered=True, categories=[2, 1]) + >>> cat_dtype = pd.api.types.CategoricalDtype( + ... categories=[2, 1], ordered=True) + >>> ser.astype(cat_dtype) 0 1 1 2 dtype: category @@ -5227,11 +5609,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): 1 2 dtype: int64 - See also + See Also -------- - pandas.to_datetime : Convert argument to datetime. - pandas.to_timedelta : Convert argument to timedelta. - pandas.to_numeric : Convert argument to a numeric type. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. numpy.ndarray.astype : Cast a numpy array to a specified type. """ if is_dict_like(dtype): @@ -5258,8 +5640,9 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): else: results.append(results.append(col.copy() if copy else col)) - elif is_categorical_dtype(dtype) and self.ndim > 1: + elif is_extension_array_dtype(dtype) and self.ndim > 1: # GH 18099: columnwise conversion to categorical + # and extension dtype results = (self[col].astype(dtype, copy=copy) for col in self) else: @@ -5428,7 +5811,8 @@ def _convert(self, datetime=False, numeric=False, timedelta=False, def convert_objects(self, convert_dates=True, convert_numeric=False, convert_timedeltas=True, copy=True): - """Attempt to infer better dtype for object columns. + """ + Attempt to infer better dtype for object columns. .. deprecated:: 0.21.0 @@ -5450,9 +5834,9 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, See Also -------- - pandas.to_datetime : Convert argument to datetime. - pandas.to_timedelta : Convert argument to timedelta. - pandas.to_numeric : Convert argument to numeric type. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to numeric type. Returns ------- @@ -5484,9 +5868,9 @@ def infer_objects(self): See Also -------- - pandas.to_datetime : Convert argument to datetime. - pandas.to_timedelta : Convert argument to timedelta. - pandas.to_numeric : Convert argument to numeric type. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to numeric type. 
Returns ------- @@ -5650,8 +6034,8 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, # fill in 2d chunks result = {col: s.fillna(method=method, value=value) for col, s in self.iteritems()} - new_obj = self._constructor.\ - from_dict(result).__finalize__(self) + prelim_obj = self._constructor.from_dict(result) + new_obj = prelim_obj.__finalize__(self) new_data = new_obj._data else: @@ -5789,7 +6173,7 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): value to use for each column (columns not in the dict will not be filled). Regular expressions, strings and lists or dicts of such objects are also allowed. - inplace : boolean, default False + inplace : bool, default False If True, in place. Note: this will modify any other views on this object (e.g. a column from a DataFrame). Returns the caller if this is True. @@ -5808,12 +6192,6 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): .. versionchanged:: 0.23.0 Added to DataFrame. - See Also - -------- - %(klass)s.fillna : Fill NA values - %(klass)s.where : Replace values based on boolean condition - Series.str.replace : Simple string replacement. - Returns ------- %(klass)s @@ -5837,6 +6215,12 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): * If a ``list`` or an ``ndarray`` is passed to `to_replace` and `value` but they are not the same length. + See Also + -------- + %(klass)s.fillna : Fill NA values. + %(klass)s.where : Replace values based on boolean condition. + Series.str.replace : Simple string replacement. + Notes ----- * Regex substitution is performed under the hood with ``re.sub``. The @@ -5951,7 +6335,7 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): 1 foo new 2 bait xyz - >>> df.replace(regex={r'^ba.$':'new', 'foo':'xyz'}) + >>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}) A B 0 new abc 1 xyz new @@ -6387,7 +6771,9 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, if _maybe_transposed_self._data.get_dtype_counts().get( 'object') == len(_maybe_transposed_self.T): - raise TypeError("Cannot interpolate with all NaNs.") + raise TypeError("Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype.") # create/use the index if method == 'linear': @@ -6423,40 +6809,98 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, def asof(self, where, subset=None): """ - The last row without any NaN is taken (or the last row without - NaN considering only the subset of columns in the case of a DataFrame) + Return the last row(s) without any NaNs before `where`. + + The last row (for each element in `where`, if list) without any + NaN is taken. + In case of a :class:`~pandas.DataFrame`, the last row without NaN + considering only the subset of columns (if not `None`) .. versionadded:: 0.19.0 For DataFrame - If there is no good value, NaN is returned for a Series + If there is no good value, NaN is returned for a Series or a Series of NaN values for a DataFrame Parameters ---------- - where : date or array of dates - subset : string or list of strings, default None - if not None use these columns for NaN propagation + where : date or array-like of dates + Date(s) before which the last row(s) are returned. + subset : str or array-like of str, default `None` + For DataFrame, if not `None`, only use these columns to + check for NaNs. 
Notes ----- - Dates are assumed to be sorted - Raises if this is not the case + Dates are assumed to be sorted. Raises if this is not the case. Returns ------- - where is scalar + scalar, Series, or DataFrame - - value or NaN if input is Series - - Series if input is DataFrame - - where is Index: same shape object as input + * scalar : when `self` is a Series and `where` is a scalar + * Series: when `self` is a Series and `where` is an array-like, + or when `self` is a DataFrame and `where` is a scalar + * DataFrame : when `self` is a DataFrame and `where` is an + array-like See Also -------- - merge_asof + merge_asof : Perform an asof merge. Similar to left join. - """ + Examples + -------- + A Series and a scalar `where`. + + >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40]) + >>> s + 10 1.0 + 20 2.0 + 30 NaN + 40 4.0 + dtype: float64 + + >>> s.asof(20) + 2.0 + + For a sequence `where`, a Series is returned. The first value is + NaN, because the first element of `where` is before the first + index value. + + >>> s.asof([5, 20]) + 5 NaN + 20 2.0 + dtype: float64 + + Missing values are not considered. The following is ``2.0``, not + NaN, even though NaN is at the index location for ``30``. + + >>> s.asof(30) + 2.0 + + Take all columns into consideration + + >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50], + ... 'b': [None, None, None, None, 500]}, + ... index=pd.DatetimeIndex(['2018-02-27 09:01:00', + ... '2018-02-27 09:02:00', + ... '2018-02-27 09:03:00', + ... '2018-02-27 09:04:00', + ... '2018-02-27 09:05:00'])) + >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', + ... '2018-02-27 09:04:30'])) + a b + 2018-02-27 09:03:30 NaN NaN + 2018-02-27 09:04:30 NaN NaN + + Take a single column into consideration + >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', + ... '2018-02-27 09:04:30']), + ... subset=['a']) + a b + 2018-02-27 09:03:30 30.0 NaN + 2018-02-27 09:04:30 40.0 NaN + """ if isinstance(where, compat.string_types): from pandas import to_datetime where = to_datetime(where) @@ -6550,10 +6994,10 @@ def asof(self, where, subset=None): See Also -------- - %(klass)s.isnull : alias of isna - %(klass)s.notna : boolean inverse of isna - %(klass)s.dropna : omit axes labels with missing values - isna : top-level isna + %(klass)s.isnull : Alias of isna. + %(klass)s.notna : Boolean inverse of isna. + %(klass)s.dropna : Omit axes labels with missing values. + isna : Top-level isna. Examples -------- @@ -6618,10 +7062,10 @@ def isnull(self): See Also -------- - %(klass)s.notnull : alias of notna - %(klass)s.isna : boolean inverse of notna - %(klass)s.dropna : omit axes labels with missing values - notna : top-level notna + %(klass)s.notnull : Alias of notna. + %(klass)s.isna : Boolean inverse of notna. + %(klass)s.dropna : Omit axes labels with missing values. + notna : Top-level notna. Examples -------- @@ -6871,12 +7315,12 @@ def clip_upper(self, threshold, axis=None, inplace=False): See Also -------- DataFrame.clip : General purpose method to trim DataFrame values to - given threshold(s) + given threshold(s). DataFrame.clip_lower : Trim DataFrame values below given - threshold(s) + threshold(s). Series.clip : General purpose method to trim Series values to given - threshold(s) - Series.clip_lower : Trim Series values below given threshold(s) + threshold(s). + Series.clip_lower : Trim Series values below given threshold(s). 
Examples -------- @@ -6949,12 +7393,12 @@ def clip_lower(self, threshold, axis=None, inplace=False): See Also -------- DataFrame.clip : General purpose method to trim DataFrame values to - given threshold(s) + given threshold(s). DataFrame.clip_upper : Trim DataFrame values above given - threshold(s) + threshold(s). Series.clip : General purpose method to trim Series values to given - threshold(s) - Series.clip_upper : Trim Series values above given threshold(s) + threshold(s). + Series.clip_upper : Trim Series values above given threshold(s). Examples -------- @@ -7170,7 +7614,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False, For PeriodIndex only, see PeriodIndex.asfreq normalize : bool, default False Whether to reset output index to midnight - fill_value: scalar, optional + fill_value : scalar, optional Value to use for missing values, applied during upsampling (note this does not fill NaNs that already were present). @@ -7244,7 +7688,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False, return asfreq(self, freq, method=method, how=how, normalize=normalize, fill_value=fill_value) - def at_time(self, time, asof=False): + def at_time(self, time, asof=False, axis=None): """ Select values at particular time of day (e.g. 9:30AM). @@ -7256,6 +7700,10 @@ def at_time(self, time, asof=False): Parameters ---------- time : datetime.time or string + axis : {0 or 'index', 1 or 'columns'}, default 0 + + .. versionadded:: 0.24.0 + Returns ------- @@ -7279,20 +7727,26 @@ def at_time(self, time, asof=False): See Also -------- - between_time : Select values between particular times of the day - first : Select initial periods of time series based on a date offset - last : Select final periods of time series based on a date offset + between_time : Select values between particular times of the day. + first : Select initial periods of time series based on a date offset. + last : Select final periods of time series based on a date offset. DatetimeIndex.indexer_at_time : Get just the index locations for - values at particular time of the day + values at particular time of the day. """ + if axis is None: + axis = self._stat_axis_number + axis = self._get_axis_number(axis) + + index = self._get_axis(axis) try: - indexer = self.index.indexer_at_time(time, asof=asof) - return self._take(indexer) + indexer = index.indexer_at_time(time, asof=asof) except AttributeError: raise TypeError('Index must be DatetimeIndex') + return self._take(indexer, axis=axis) + def between_time(self, start_time, end_time, include_start=True, - include_end=True): + include_end=True, axis=None): """ Select values between particular times of the day (e.g., 9:00-9:30 AM). @@ -7310,6 +7764,9 @@ def between_time(self, start_time, end_time, include_start=True, end_time : datetime.time or string include_start : boolean, default True include_end : boolean, default True + axis : {0 or 'index', 1 or 'columns'}, default 0 + + .. versionadded:: 0.24.0 Returns ------- @@ -7341,64 +7798,91 @@ def between_time(self, start_time, end_time, include_start=True, See Also -------- - at_time : Select values at a particular time of the day - first : Select initial periods of time series based on a date offset - last : Select final periods of time series based on a date offset + at_time : Select values at a particular time of the day. + first : Select initial periods of time series based on a date offset. + last : Select final periods of time series based on a date offset. 
DatetimeIndex.indexer_between_time : Get just the index locations for - values between particular times of the day + values between particular times of the day. """ + if axis is None: + axis = self._stat_axis_number + axis = self._get_axis_number(axis) + + index = self._get_axis(axis) try: - indexer = self.index.indexer_between_time( + indexer = index.indexer_between_time( start_time, end_time, include_start=include_start, include_end=include_end) - return self._take(indexer) except AttributeError: raise TypeError('Index must be DatetimeIndex') + return self._take(indexer, axis=axis) + def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, limit=None, base=0, on=None, level=None): """ + Resample time-series data. + Convenience method for frequency conversion and resampling of time - series. Object must have a datetime-like index (DatetimeIndex, - PeriodIndex, or TimedeltaIndex), or pass datetime-like values - to the on or level keyword. + series. Object must have a datetime-like index (`DatetimeIndex`, + `PeriodIndex`, or `TimedeltaIndex`), or pass datetime-like values + to the `on` or `level` keyword. Parameters ---------- - rule : string - the offset string or object representing target conversion - axis : int, optional, default 0 - closed : {'right', 'left'} + rule : str + The offset string or object representing target conversion. + how : str + Method for down/re-sampling, default to 'mean' for downsampling. + + .. deprecated:: 0.18.0 + The new syntax is ``.resample(...).mean()``, or + ``.resample(...).apply()`` + axis : {0 or 'index', 1 or 'columns'}, default 0 + Which axis to use for up- or down-sampling. For `Series` this + will default to 0, i.e. along the rows. Must be + `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`. + fill_method : str, default None + Filling method for upsampling. + + .. deprecated:: 0.18.0 + The new syntax is ``.resample(...).()``, + e.g. ``.resample(...).pad()`` + closed : {'right', 'left'}, default None Which side of bin interval is closed. The default is 'left' for all frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. - label : {'right', 'left'} + label : {'right', 'left'}, default None Which bin edge label to label bucket with. The default is 'left' for all frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. - convention : {'start', 'end', 's', 'e'} - For PeriodIndex only, controls whether to use the start or end of - `rule` - kind: {'timestamp', 'period'}, optional + convention : {'start', 'end', 's', 'e'}, default 'start' + For `PeriodIndex` only, controls whether to use the start or + end of `rule`. + kind : {'timestamp', 'period'}, optional, default None Pass 'timestamp' to convert the resulting index to a - ``DateTimeIndex`` or 'period' to convert it to a ``PeriodIndex``. + `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`. By default the input representation is retained. - loffset : timedelta - Adjust the resampled time labels + loffset : timedelta, default None + Adjust the resampled time labels. + limit : int, default None + Maximum size gap when reindexing with `fill_method`. + + .. deprecated:: 0.18.0 base : int, default 0 For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could - range from 0 through 4. Defaults to 0 - on : string, optional + range from 0 through 4. 
Defaults to 0. + on : str, optional For a DataFrame, column to use instead of index for resampling. Column must be datetime-like. .. versionadded:: 0.19.0 - level : string or int, optional + level : str or int, optional For a MultiIndex, level (name or number) to use for - resampling. Level must be datetime-like. + resampling. `level` must be datetime-like. .. versionadded:: 0.19.0 @@ -7415,6 +7899,12 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, To learn more about the offset strings, please see `this link `__. + See Also + -------- + groupby : Group by mapping, function, label, or list of labels. + Series.resample : Resample a Series. + DataFrame.resample: Resample a DataFrame. + Examples -------- @@ -7471,7 +7961,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, Upsample the series into 30 second bins. - >>> series.resample('30S').asfreq()[0:5] #select first 5 rows + >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN 2000-01-01 00:01:00 1.0 @@ -7504,8 +7994,8 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, Pass a custom function via ``apply`` >>> def custom_resampler(array_like): - ... return np.sum(array_like)+5 - + ... return np.sum(array_like) + 5 + ... >>> series.resample('3T').apply(custom_resampler) 2000-01-01 00:00:00 8 2000-01-01 00:03:00 17 @@ -7515,73 +8005,106 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, For a Series with a PeriodIndex, the keyword `convention` can be used to control whether to use the start or end of `rule`. + Resample a year by quarter using 'start' `convention`. Values are + assigned to the first quarter of the period. + >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01', - freq='A', - periods=2)) + ... freq='A', + ... periods=2)) >>> s 2012 1 2013 2 Freq: A-DEC, dtype: int64 - - Resample by month using 'start' `convention`. Values are assigned to - the first month of the period. - - >>> s.resample('M', convention='start').asfreq().head() - 2012-01 1.0 - 2012-02 NaN - 2012-03 NaN - 2012-04 NaN - 2012-05 NaN - Freq: M, dtype: float64 - - Resample by month using 'end' `convention`. Values are assigned to - the last month of the period. - - >>> s.resample('M', convention='end').asfreq() - 2012-12 1.0 - 2013-01 NaN - 2013-02 NaN - 2013-03 NaN - 2013-04 NaN - 2013-05 NaN - 2013-06 NaN - 2013-07 NaN - 2013-08 NaN - 2013-09 NaN - 2013-10 NaN - 2013-11 NaN - 2013-12 2.0 + >>> s.resample('Q', convention='start').asfreq() + 2012Q1 1.0 + 2012Q2 NaN + 2012Q3 NaN + 2012Q4 NaN + 2013Q1 2.0 + 2013Q2 NaN + 2013Q3 NaN + 2013Q4 NaN + Freq: Q-DEC, dtype: float64 + + Resample quarters by month using 'end' `convention`. Values are + assigned to the last month of the period. + + >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01', + ... freq='Q', + ... periods=4)) + >>> q + 2018Q1 1 + 2018Q2 2 + 2018Q3 3 + 2018Q4 4 + Freq: Q-DEC, dtype: int64 + >>> q.resample('M', convention='end').asfreq() + 2018-03 1.0 + 2018-04 NaN + 2018-05 NaN + 2018-06 2.0 + 2018-07 NaN + 2018-08 NaN + 2018-09 3.0 + 2018-10 NaN + 2018-11 NaN + 2018-12 4.0 Freq: M, dtype: float64 - For DataFrame objects, the keyword ``on`` can be used to specify the + For DataFrame objects, the keyword `on` can be used to specify the column instead of the index for resampling. 
- >>> df = pd.DataFrame(data=9*[range(4)], columns=['a', 'b', 'c', 'd']) - >>> df['time'] = pd.date_range('1/1/2000', periods=9, freq='T') - >>> df.resample('3T', on='time').sum() - a b c d - time - 2000-01-01 00:00:00 0 3 6 9 - 2000-01-01 00:03:00 0 3 6 9 - 2000-01-01 00:06:00 0 3 6 9 - - For a DataFrame with MultiIndex, the keyword ``level`` can be used to - specify on level the resampling needs to take place. - - >>> time = pd.date_range('1/1/2000', periods=5, freq='T') - >>> df2 = pd.DataFrame(data=10*[range(4)], - columns=['a', 'b', 'c', 'd'], - index=pd.MultiIndex.from_product([time, [1, 2]]) - ) - >>> df2.resample('3T', level=0).sum() - a b c d - 2000-01-01 00:00:00 0 6 12 18 - 2000-01-01 00:03:00 0 4 8 12 - - See also - -------- - groupby : Group by mapping, function, label, or list of labels. + >>> d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) + >>> df = pd.DataFrame(d) + >>> df['week_starting'] = pd.date_range('01/01/2018', + ... periods=8, + ... freq='W') + >>> df + price volume week_starting + 0 10 50 2018-01-07 + 1 11 60 2018-01-14 + 2 9 40 2018-01-21 + 3 13 100 2018-01-28 + 4 14 50 2018-02-04 + 5 18 100 2018-02-11 + 6 17 40 2018-02-18 + 7 19 50 2018-02-25 + >>> df.resample('M', on='week_starting').mean() + price volume + week_starting + 2018-01-31 10.75 62.5 + 2018-02-28 17.00 60.0 + + For a DataFrame with MultiIndex, the keyword `level` can be used to + specify on which level the resampling needs to take place. + + >>> days = pd.date_range('1/1/2000', periods=4, freq='D') + >>> d2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) + >>> df2 = pd.DataFrame(d2, + ... index=pd.MultiIndex.from_product([days, + ... ['morning', + ... 'afternoon']] + ... )) + >>> df2 + price volume + 2000-01-01 morning 10 50 + afternoon 11 60 + 2000-01-02 morning 9 40 + afternoon 13 100 + 2000-01-03 morning 14 50 + afternoon 18 100 + 2000-01-04 morning 17 40 + afternoon 19 50 + >>> df2.resample('D', level=0).sum() + price volume + 2000-01-01 21 110 + 2000-01-02 22 140 + 2000-01-03 32 150 + 2000-01-04 36 90 """ + from pandas.core.resample import (resample, _maybe_process_deprecations) axis = self._get_axis_number(axis) @@ -7636,9 +8159,9 @@ def first(self, offset): See Also -------- - last : Select final periods of time series based on a date offset - at_time : Select values at a particular time of the day - between_time : Select values between particular times of the day + last : Select final periods of time series based on a date offset. + at_time : Select values at a particular time of the day. + between_time : Select values between particular times of the day. """ if not isinstance(self.index, DatetimeIndex): raise TypeError("'first' only supports a DatetimeIndex index") @@ -7699,9 +8222,9 @@ def last(self, offset): See Also -------- - first : Select initial periods of time series based on a date offset - at_time : Select values at a particular time of the day - between_time : Select values between particular times of the day + first : Select initial periods of time series based on a date offset. + at_time : Select values at a particular time of the day. + between_time : Select values between particular times of the day. """ if not isinstance(self.index, DatetimeIndex): raise TypeError("'last' only supports a DatetimeIndex index") @@ -8010,7 +8533,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, # This is a single-dimensional object. 
if not is_bool_dtype(cond): raise ValueError(msg.format(dtype=cond.dtype)) - else: + elif not cond.empty: for dt in cond.dtypes: if not is_bool_dtype(dt): raise ValueError(msg.format(dtype=dt)) @@ -8185,7 +8708,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, See Also -------- :func:`DataFrame.%(name_other)s` : Return an object of same shape as - self + self. Examples -------- @@ -8289,31 +8812,59 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, errors=errors) _shared_docs['shift'] = (""" - Shift index by desired number of periods with an optional time freq + Shift index by desired number of periods with an optional time `freq`. + + When `freq` is not passed, shift the index without realigning the data. + If `freq` is passed (in this case, the index must be date or datetime, + or it will raise a `NotImplementedError`), the index will be + increased using the periods and the `freq`. Parameters ---------- periods : int - Number of periods to move, can be positive or negative. - freq : DateOffset, timedelta, or time rule string, optional - Increment to use from the tseries module or time rule (e.g. 'EOM'). - See Notes. - axis : %(axes_single_arg)s + Number of periods to shift. Can be positive or negative. + freq : DateOffset, tseries.offsets, timedelta, or str, optional + Offset to use from the tseries module or time rule (e.g. 'EOM'). + If `freq` is specified then the index values are shifted but the + data is not realigned. That is, use `freq` if you would like to + extend the index when shifting and preserve the original data. + axis : {0 or 'index', 1 or 'columns', None}, default None + Shift direction. + + Returns + ------- + %(klass)s + Copy of input object, shifted. See Also -------- Index.shift : Shift values of Index. DatetimeIndex.shift : Shift values of DatetimeIndex. + PeriodIndex.shift : Shift values of PeriodIndex. + tshift : Shift the time index, using the index's frequency if + available. - Notes - ----- - If freq is specified then the index values are shifted but the data - is not realigned. That is, use freq if you would like to extend the - index when shifting and preserve the original data. - - Returns - ------- - shifted : %(klass)s + Examples + -------- + >>> df = pd.DataFrame({'Col1': [10, 20, 15, 30, 45], + ... 'Col2': [13, 23, 18, 33, 48], + ... 'Col3': [17, 27, 22, 37, 52]}) + + >>> df.shift(periods=3) + Col1 Col2 Col3 + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 10.0 13.0 17.0 + 4 20.0 23.0 27.0 + + >>> df.shift(periods=1, axis='columns') + Col1 Col2 Col3 + 0 NaN 10.0 13.0 + 1 NaN 20.0 23.0 + 2 NaN 15.0 18.0 + 3 NaN 30.0 33.0 + 4 NaN 45.0 48.0 """) @Appender(_shared_docs['shift'] % _shared_doc_kwargs) @@ -8629,7 +9180,7 @@ def _tz_convert(ax, tz): return result.__finalize__(self) def tz_localize(self, tz, axis=0, level=None, copy=True, - ambiguous='raise'): + ambiguous='raise', nonexistent='raise'): """ Localize tz-naive TimeSeries to target time zone. @@ -8643,6 +9194,13 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, copy : boolean, default True Also make a copy of the underlying data ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. 
+ - 'infer' will attempt to infer fall dst-transition hours based on order - bool-ndarray where True signifies a DST time, False designates @@ -8651,6 +9209,17 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, - 'NaT' will return NaT where there are ambiguous times - 'raise' will raise an AmbiguousTimeError if there are ambiguous times + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent times forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 Returns ------- @@ -8659,11 +9228,60 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, ------ TypeError If the TimeSeries is tz-aware and tz is not None. + + Examples + -------- + + Localize local times: + + >>> s = pd.Series([1], + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00'])) + >>> s.tz_localize('CET') + 2018-09-15 01:30:00+02:00 1 + dtype: int64 + + Be careful with DST changes. When there is sequential data, pandas + can infer the DST time: + + >>> s = pd.Series(range(7), index=pd.DatetimeIndex([ + ... '2018-10-28 01:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 03:00:00', + ... '2018-10-28 03:30:00'])) + >>> s.tz_localize('CET', ambiguous='infer') + 2018-10-28 01:30:00+02:00 0 + 2018-10-28 02:00:00+02:00 1 + 2018-10-28 02:30:00+02:00 2 + 2018-10-28 02:00:00+01:00 3 + 2018-10-28 02:30:00+01:00 4 + 2018-10-28 03:00:00+01:00 5 + 2018-10-28 03:30:00+01:00 6 + dtype: int64 + + In some cases, inferring the DST is impossible. In such cases, you can + pass an ndarray to the ambiguous parameter to set the DST explicitly + + >>> s = pd.Series(range(3), index=pd.DatetimeIndex([ + ... '2018-10-28 01:20:00', + ... '2018-10-28 02:36:00', + ... 
'2018-10-28 03:46:00'])) + >>> s.tz_localize('CET', ambiguous=np.array([True, True, False])) + 2018-10-28 01:20:00+02:00 0 + 2018-10-28 02:36:00+02:00 1 + 2018-10-28 03:46:00+01:00 2 + dtype: int64 """ + if nonexistent not in ('raise', 'NaT', 'shift'): + raise ValueError("The nonexistent argument must be one of 'raise'," + " 'NaT' or 'shift'") + axis = self._get_axis_number(axis) ax = self._get_axis(axis) - def _tz_localize(ax, tz, ambiguous): + def _tz_localize(ax, tz, ambiguous, nonexistent): if not hasattr(ax, 'tz_localize'): if len(ax) > 0: ax_name = self._get_axis_name(axis) @@ -8672,19 +9290,23 @@ def _tz_localize(ax, tz, ambiguous): else: ax = DatetimeIndex([], tz=tz) else: - ax = ax.tz_localize(tz, ambiguous=ambiguous) + ax = ax.tz_localize( + tz, ambiguous=ambiguous, nonexistent=nonexistent + ) return ax # if a level is given it must be a MultiIndex level or # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - new_level = _tz_localize(ax.levels[level], tz, ambiguous) + new_level = _tz_localize( + ax.levels[level], tz, ambiguous, nonexistent + ) ax = ax.set_levels(new_level, level=level) else: if level not in (None, 0, ax.name): raise ValueError("The level {0} is not valid".format(level)) - ax = _tz_localize(ax, tz, ambiguous) + ax = _tz_localize(ax, tz, ambiguous, nonexistent) result = self._constructor(self._data, copy=copy) result.set_axis(ax, axis=axis, inplace=True) @@ -8757,7 +9379,7 @@ def abs(self): See Also -------- - numpy.absolute : calculate the absolute value element-wise. + numpy.absolute : Calculate the absolute value element-wise. """ return np.abs(self) @@ -9040,10 +9662,16 @@ def describe_categorical_1d(data): if is_datetime64_any_dtype(data): tz = data.dt.tz asint = data.dropna().values.view('i8') + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) names += ['top', 'freq', 'first', 'last'] - result += [tslib.Timestamp(top, tz=tz), freq, - tslib.Timestamp(asint.min(), tz=tz), - tslib.Timestamp(asint.max(), tz=tz)] + result += [top, freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz)] else: names += ['top', 'freq'] result += [top, freq] @@ -9089,7 +9717,9 @@ def describe_1d(data): return d def _check_percentile(self, q): - """Validate percentiles (used by describe and quantile).""" + """ + Validate percentiles (used by describe and quantile). + """ msg = ("percentiles should all be in the interval [0, 1]. " "Try {0} instead.") @@ -9248,7 +9878,9 @@ def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): @classmethod def _add_numeric_operations(cls): - """Add the operations to the cls; evaluate the doc strings again""" + """ + Add the operations to the cls; evaluate the doc strings again + """ axis_descr, name, name2 = _doc_parms(cls) @@ -9347,7 +9979,7 @@ def compound(self, axis=None, skipna=None, level=None): cls, 'kurt', name, name2, axis_descr, "Return unbiased kurtosis over requested axis using Fisher's " "definition of\nkurtosis (kurtosis of normal == 0.0). Normalized " - "by N-1\n", + "by N-1", nanops.nankurt) cls.kurtosis = cls.kurt cls.prod = _make_min_count_stat_function( @@ -9364,7 +9996,7 @@ def compound(self, axis=None, skipna=None, level=None): """This method returns the maximum of the values in the object. If you want the *index* of the maximum, use ``idxmax``. 
This is the equivalent of the ``numpy.ndarray`` method ``argmax``.""", - nanops.nanmax) + nanops.nanmax, _max_examples) cls.min = _make_stat_function( cls, 'min', name, name2, axis_descr, """This method returns the minimum of the values in the object. @@ -9374,7 +10006,8 @@ def compound(self, axis=None, skipna=None, level=None): @classmethod def _add_series_only_operations(cls): - """Add the series only operations to the cls; evaluate the doc + """ + Add the series only operations to the cls; evaluate the doc strings again. """ @@ -9390,19 +10023,18 @@ def nanptp(values, axis=0, skipna=True): cls.ptp = _make_stat_function( cls, 'ptp', name, name2, axis_descr, - """ - Returns the difference between the maximum value and the + """Returns the difference between the maximum value and the minimum value in the object. This is the equivalent of the ``numpy.ndarray`` method ``ptp``. .. deprecated:: 0.24.0 - Use numpy.ptp instead - """, + Use numpy.ptp instead""", nanptp) @classmethod def _add_series_or_dataframe_operations(cls): - """Add the series or dataframe only operations to the cls; evaluate + """ + Add the series or dataframe only operations to the cls; evaluate the doc strings again. """ @@ -9464,7 +10096,8 @@ def transform(self, func, *args, **kwargs): """ def _find_valid_index(self, how): - """Retrieves the index of the first valid value. + """ + Retrieves the index of the first valid value. Parameters ---------- @@ -9507,145 +10140,6 @@ def first_valid_index(self): def last_valid_index(self): return self._find_valid_index('last') - def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, - columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, compression='infer', quoting=None, - quotechar='"', line_terminator='\n', chunksize=None, - tupleize_cols=None, date_format=None, doublequote=True, - escapechar=None, decimal='.'): - r""" - Write object to a comma-separated values (csv) file. - - .. versionchanged:: 0.24.0 - The order of arguments for Series was changed. - - Parameters - ---------- - path_or_buf : str or file handle, default None - File path or object, if None is provided the result is returned as - a string. - - .. versionchanged:: 0.24.0 - - Was previously named "path" for Series. - - sep : str, default ',' - String of length 1. Field delimiter for the output file. - na_rep : str, default '' - Missing data representation. - float_format : str, default None - Format string for floating point numbers. - columns : sequence, optional - Columns to write. - header : bool or list of str, default True - Write out the column names. If a list of strings is given it is - assumed to be aliases for the column names. - - .. versionchanged:: 0.24.0 - - Previously defaulted to False for Series. - - index : bool, default True - Write row names (index). - index_label : str or sequence, or False, default None - Column label for index column(s) if desired. If None is given, and - `header` and `index` are True, then the index names are used. A - sequence should be given if the object uses MultiIndex. If - False do not print fields for index names. Use index_label=False - for easier importing in R. - mode : str - Python write mode, default 'w'. - encoding : str, optional - A string representing the encoding to use in the output file, - defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. - compression : str, default 'infer' - Compression mode among the following possible values: {'infer', - 'gzip', 'bz2', 'zip', 'xz', None}. 
If 'infer' and `path_or_buf` - is path-like, then detect compression from the following - extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no - compression). - - .. versionchanged:: 0.24.0 - - 'infer' option added and set to default. - - quoting : optional constant from csv module - Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` - then floats are converted to strings and thus csv.QUOTE_NONNUMERIC - will treat them as non-numeric. - quotechar : str, default '\"' - String of length 1. Character used to quote fields. - line_terminator : string, default ``'\n'`` - The newline character or character sequence to use in the output - file. - chunksize : int or None - Rows to write at a time. - tupleize_cols : bool, default False - Write MultiIndex columns as a list of tuples (if True) or in - the new, expanded format, where each MultiIndex column is a row - in the CSV (if False). - - .. deprecated:: 0.21.0 - This argument will be removed and will always write each row - of the multi-index as a separate row in the CSV file. - date_format : str, default None - Format string for datetime objects. - doublequote : bool, default True - Control quoting of `quotechar` inside a field. - escapechar : str, default None - String of length 1. Character used to escape `sep` and `quotechar` - when appropriate. - decimal : str, default '.' - Character recognized as decimal separator. E.g. use ',' for - European data. - - Returns - ------- - None or str - If path_or_buf is None, returns the resulting csv format as a - string. Otherwise returns None. - - See Also - -------- - pandas.read_csv : Load a CSV file into a DataFrame. - pandas.to_excel : Load an Excel file into a DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], - ... 'mask': ['red', 'purple'], - ... 'weapon': ['sai', 'bo staff']}) - >>> df.to_csv(index=False) - 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' - """ - - df = self if isinstance(self, ABCDataFrame) else self.to_frame() - - if tupleize_cols is not None: - warnings.warn("The 'tupleize_cols' parameter is deprecated and " - "will be removed in a future version", - FutureWarning, stacklevel=2) - else: - tupleize_cols = False - - from pandas.io.formats.csvs import CSVFormatter - formatter = CSVFormatter(df, path_or_buf, - line_terminator=line_terminator, sep=sep, - encoding=encoding, - compression=compression, quoting=quoting, - na_rep=na_rep, float_format=float_format, - cols=columns, header=header, index=index, - index_label=index_label, mode=mode, - chunksize=chunksize, quotechar=quotechar, - tupleize_cols=tupleize_cols, - date_format=date_format, - doublequote=doublequote, - escapechar=escapechar, decimal=decimal) - formatter.save() - - if path_or_buf is None: - return formatter.path_or_buf.getvalue() - def _doc_parms(cls): """Return a tuple of the doc parms.""" @@ -9658,7 +10152,6 @@ def _doc_parms(cls): _num_doc = """ - %(desc)s Parameters @@ -9676,12 +10169,11 @@ def _doc_parms(cls): Returns ------- -%(outname)s : %(name1)s or %(name2)s (if level specified) +%(outname)s : %(name1)s or %(name2)s (if level specified)\ %(examples)s""" _num_ddof_doc = """ - %(desc)s Parameters @@ -9785,10 +10277,10 @@ def _doc_parms(cls): """ _all_see_also = """\ -See also +See Also -------- -pandas.Series.all : Return True if all elements are True -pandas.DataFrame.any : Return True if one (or more) elements are True +Series.all : Return True if all elements are True. 
+DataFrame.any : Return True if one (or more) elements are True. """ _cnum_doc = """ @@ -9812,9 +10304,9 @@ def _doc_parms(cls): ------- %(outname)s : %(name1)s or %(name2)s\n %(examples)s -See also +See Also -------- -pandas.core.window.Expanding.%(accum_func_name)s : Similar functionality +core.window.Expanding.%(accum_func_name)s : Similar functionality but ignores ``NaN`` values. %(name2)s.%(accum_func_name)s : Return the %(desc)s over %(name2)s axis. @@ -10165,6 +10657,40 @@ def _doc_parms(cls): _sum_examples = """\ Examples -------- +``MultiIndex`` series example of monthly rainfall + +>>> index = pd.MultiIndex.from_product( +... [['London', 'New York'], ['Jun', 'Jul', 'Aug']], +... names=['city', 'month']) +>>> s = pd.Series([47, 35, 54, 112, 117, 113], index=index) +>>> s +city month +London Jun 47 + Jul 35 + Aug 54 +New York Jun 112 + Jul 117 + Aug 113 +dtype: int64 + +>>> s.sum() +478 + +Sum using level names, as well as indices + +>>> s.sum(level='city') +city +London 136 +New York 342 +dtype: int64 + +>>> s.sum(level=1) +month +Jun 159 +Jul 152 +Aug 167 +dtype: int64 + By default, the sum of an empty or all-NA Series is ``0``. >>> pd.Series([]).sum() # min_count=0 is the default @@ -10209,6 +10735,44 @@ def _doc_parms(cls): nan """ +_max_examples = """\ +Examples +-------- +``MultiIndex`` series example of monthly rainfall + +>>> index = pd.MultiIndex.from_product( +... [['London', 'New York'], ['Jun', 'Jul', 'Aug']], +... names=['city', 'month']) +>>> s = pd.Series([47, 35, 54, 112, 117, 113], index=index) +>>> s +city month +London Jun 47 + Jul 35 + Aug 54 +New York Jun 112 + Jul 117 + Aug 113 +dtype: int64 + +>>> s.max() +117 + +Max using level names, as well as indices + +>>> s.max(level='city') +city +London 54 +New York 117 +dtype: int64 + +>>> s.max(level=1) +month +Jun 112 +Jul 117 +Aug 113 +dtype: int64 +""" + _min_count_stub = """\ min_count : int, default 0 @@ -10246,9 +10810,10 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, return set_function_name(stat_func, name, cls) -def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): +def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f, + examples=''): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, min_count='', examples='') + axis_descr=axis_descr, min_count='', examples=examples) @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): @@ -10305,7 +10870,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): issubclass(y.dtype.type, (np.datetime64, np.timedelta64))): result = accum_func(y, axis) mask = isna(self) - np.putmask(result, mask, tslib.iNaT) + np.putmask(result, mask, iNaT) elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): mask = isna(self) np.putmask(y, mask, mask_a) diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 6f90fd1cff7e6..9c15a5ebfe0f2 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,4 +1,4 @@ -from pandas.core.groupby.groupby import GroupBy # flake8: noqa -from pandas.core.groupby.generic import ( - SeriesGroupBy, DataFrameGroupBy, PanelGroupBy) # flake8: noqa -from pandas.core.groupby.grouper import Grouper # flake8: noqa +from pandas.core.groupby.groupby import GroupBy # noqa: F401 +from pandas.core.groupby.generic import ( # noqa: F401 + SeriesGroupBy, DataFrameGroupBy, PanelGroupBy) +from pandas.core.groupby.grouper import Grouper # noqa: 
F401 diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index ac84971de08d8..a148f7e0cab87 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -5,16 +5,22 @@ """ import types + from pandas.util._decorators import make_signature -from pandas.core.dtypes.common import is_scalar, is_list_like + +from pandas.core.dtypes.common import is_list_like, is_scalar class GroupByMixin(object): - """ provide the groupby facilities to the mixed object """ + """ + Provide the groupby facilities to the mixed object. + """ @staticmethod def _dispatch(name, *args, **kwargs): - """ dispatch to apply """ + """ + Dispatch to apply. + """ def outer(self, *args, **kwargs): def f(x): @@ -26,8 +32,7 @@ def f(x): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index e54045884ea93..85f51323a97b5 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,7 +1,8 @@ import numpy as np + from pandas.core.algorithms import unique1d from pandas.core.arrays.categorical import ( - _recode_for_categories, CategoricalDtype, Categorical) + Categorical, CategoricalDtype, _recode_for_categories) def recode_for_groupby(c, sort, observed): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f15b1203a334e..26e437355fa8b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -7,46 +7,40 @@ """ import collections -import warnings import copy -from textwrap import dedent from functools import partial +from textwrap import dedent +import warnings import numpy as np -from pandas._libs import lib, Timestamp -from pandas.util._decorators import Substitution, Appender -from pandas import compat - -import pandas.core.indexes.base as ibase -import pandas.core.common as com -from pandas.core.panel import Panel +from pandas._libs import Timestamp, lib +import pandas.compat as compat from pandas.compat import lzip, map +from pandas.compat.numpy import _np_version_under1p13 +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution -from pandas.core.series import Series -from pandas.core.generic import _shared_docs -from pandas.core.groupby.groupby import ( - GroupBy, _apply_docs, _transform_template) -from pandas.core.generic import NDFrame -from pandas.core.groupby import base +from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - is_scalar, - is_bool, - is_datetimelike, - is_numeric_dtype, - is_integer_dtype, - is_interval_dtype, - ensure_platform_int, - ensure_int64) + ensure_int64, ensure_platform_int, is_bool, is_datetimelike, + is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna + import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical +from pandas.core.base import DataError, SpecificationError +import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.dtypes.cast import maybe_downcast_to_dtype -from pandas.core.base import SpecificationError, DataError -from pandas.core.index import Index, MultiIndex, CategoricalIndex -from pandas.core.arrays.categorical import Categorical +from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby import base +from 
pandas.core.groupby.groupby import ( + GroupBy, _apply_docs, _transform_template) +from pandas.core.index import CategoricalIndex, Index, MultiIndex +import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block -from pandas.compat.numpy import _np_version_under1p13 +from pandas.core.panel import Panel +from pandas.core.series import Series from pandas.plotting._core import boxplot_frame_groupby @@ -247,7 +241,7 @@ def _aggregate_generic(self, func, *args, **kwargs): return self._wrap_generic_output(result, obj) def _wrap_aggregated_output(self, output, names=None): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _aggregate_item_by_item(self, func, *args, **kwargs): # only for axis==0 @@ -417,7 +411,9 @@ def first_not_none(values): if (isinstance(v.index, MultiIndex) or key_index is None or isinstance(key_index, MultiIndex)): - stacked_values = np.vstack(map(np.asarray, values)) + stacked_values = np.vstack([ + np.asarray(v) for v in values + ]) result = DataFrame(stacked_values, index=key_index, columns=index) else: @@ -429,7 +425,8 @@ def first_not_none(values): axis=self.axis).unstack() result.columns = index else: - stacked_values = np.vstack(map(np.asarray, values)) + stacked_values = np.vstack([np.asarray(v) + for v in values]) result = DataFrame(stacked_values.T, index=v.index, columns=key_index) @@ -590,14 +587,17 @@ def _choose_path(self, fast_path, slow_path, group): try: res_fast = fast_path(group) - # compare that we get the same results + # verify fast path does not change columns (and names), otherwise + # its results cannot be joined with those of the slow path + if res_fast.columns != group.columns: + return path, res + # verify numerical equality with the slow path if res.shape == res_fast.shape: res_r = res.values.ravel() res_fast_r = res_fast.values.ravel() mask = notna(res_r) - if (res_r[mask] == res_fast_r[mask]).all(): - path = fast_path - + if (res_r[mask] == res_fast_r[mask]).all(): + path = fast_path except Exception: pass return path, res @@ -734,7 +734,7 @@ def _selection_name(self): 1 1 2 2 3 4 - See also + See Also -------- pandas.Series.groupby.apply pandas.Series.groupby.transform @@ -826,8 +826,9 @@ def _aggregate_multiple_funcs(self, arg, _level): for name, func in arg: obj = self if name in results: - raise SpecificationError('Function names must be unique, ' - 'found multiple named %s' % name) + raise SpecificationError( + 'Function names must be unique, found multiple named ' + '{}'.format(name)) # reset the cache so that we # only include the named selection @@ -1027,8 +1028,8 @@ def nunique(self, dropna=True): try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes - assert val.dtype == object, \ - 'val.dtype must be object, got %s' % val.dtype + msg = 'val.dtype must be object, got {}'.format(val.dtype) + assert val.dtype == object, msg val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) _isna = lambda a: a == -1 @@ -1111,7 +1112,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, lab = cut(Series(val), bins, include_lowest=True) lev = lab.cat.categories lab = lev.take(lab.cat.codes) - llab = lambda lab, inc: lab[inc]._multiindex.labels[-1] + llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] if is_interval_dtype(lab): # TODO: should we do this inside II? 
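A minimal sketch of the docstring-templating pattern that the `_make_stat_function` change above relies on: a shared numeric-doc template is rendered with per-function values, and the new `examples` argument lets `sum`/`max` inject the `MultiIndex` doctests added in this patch. The template and helper below are simplified stand-ins for pandas' `_num_doc`, `Substitution` and `Appender`, not the real implementations.

# Simplified stand-in for the Substitution/Appender doc machinery.
_num_doc_stub = """
%(desc)s

Returns
-------
%(outname)s : scalar

%(examples)s
"""

_sum_examples_stub = """Examples
--------
>>> pd.Series([1, 2, 3]).sum()
6
"""


def make_stat_function(name, desc, func, examples=''):
    # Render the shared template with this function's values and attach it
    # as the docstring, mirroring @Substitution(...) + @Appender(_num_doc).
    def stat_func(values):
        return func(values)

    stat_func.__name__ = name
    stat_func.__doc__ = _num_doc_stub % {'outname': name,
                                         'desc': desc,
                                         'examples': examples}
    return stat_func


my_sum = make_stat_function('sum', 'Return the sum of the values.', sum,
                            examples=_sum_examples_stub)
print(my_sum([1, 2, 3]))   # 6
print(my_sum.__doc__)      # shared template filled with the sum examples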
@@ -1162,7 +1163,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, out, labels[-1] = out[sorter], labels[-1][sorter] if bins is None: - mi = MultiIndex(levels=levels, labels=labels, names=names, + mi = MultiIndex(levels=levels, codes=labels, names=names, verify_integrity=False) if is_integer_dtype(out): @@ -1190,10 +1191,10 @@ def value_counts(self, normalize=False, sort=True, ascending=False, out, left[-1] = out[sorter], left[-1][sorter] # build the multi-index w/ full levels - labels = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) - labels.append(left[-1]) + codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) + codes.append(left[-1]) - mi = MultiIndex(levels=levels, labels=labels, names=names, + mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) if is_integer_dtype(out): @@ -1288,12 +1289,11 @@ class DataFrameGroupBy(NDFrameGroupBy): 1 1 2 0.590716 2 3 4 0.704907 - See also + See Also -------- pandas.DataFrame.groupby.apply pandas.DataFrame.groupby.transform pandas.DataFrame.aggregate - """) @Appender(_agg_doc) @@ -1659,4 +1659,4 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): raise ValueError("axis value must be greater than 0") def _wrap_aggregated_output(self, output, names=None): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 61dadd833be35..253860d83f49e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -7,46 +7,42 @@ class providing the base-class of operations. expose these user-facing objects to provide specific functionailty. """ -import types -from functools import wraps, partial -import datetime import collections -import warnings from contextlib import contextmanager +import datetime +from functools import partial, wraps +import types +import warnings import numpy as np -from pandas._libs import groupby as libgroupby, Timestamp -from pandas.util._validators import validate_kwargs -from pandas.util._decorators import ( - cache_readonly, Substitution, Appender) - -from pandas import compat -from pandas.compat import zip, range, callable, set_function_name +from pandas._libs import Timestamp, groupby as libgroupby +import pandas.compat as compat +from pandas.compat import callable, range, set_function_name, zip from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._validators import validate_kwargs -from pandas.core.dtypes.common import ( - is_numeric_dtype, - is_scalar, - ensure_float) from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.common import ( + ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna +import pandas.core.algorithms as algorithms +from pandas.core.base import ( + DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError) +import pandas.core.common as com +from pandas.core.config import option_context +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame from pandas.core.groupby import base -from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, - DataError, SpecificationError) from pandas.core.index import Index, MultiIndex -from pandas.core.generic import NDFrame -from pandas.core.frame import DataFrame from pandas.core.series 
import Series from pandas.core.sorting import get_group_index_sorter -import pandas.core.common as com -import pandas.core.algorithms as algorithms -from pandas.core.config import option_context _doc_template = """ - - See also + See Also -------- pandas.Series.%(name)s pandas.DataFrame.%(name)s @@ -94,7 +90,7 @@ class providing the base-class of operations. -------- {examples} - See also + See Also -------- pipe : Apply function to the full GroupBy object instead of to each group. @@ -219,8 +215,8 @@ class providing the base-class of operations. See Also -------- -pandas.Series.pipe : Apply a function with arguments to a series -pandas.DataFrame.pipe: Apply a function with arguments to a dataframe +pandas.Series.pipe : Apply a function with arguments to a series. +pandas.DataFrame.pipe: Apply a function with arguments to a dataframe. apply : Apply function to each group instead of to the full %(klass)s object. """ @@ -256,7 +252,7 @@ class providing the base-class of operations. ------- %(klass)s -See also +See Also -------- aggregate, transform @@ -295,7 +291,7 @@ class providing the base-class of operations. class GroupByPlot(PandasObject): """ - Class implementing the .plot attribute for groupby objects + Class implementing the .plot attribute for groupby objects. """ def __init__(self, groupby): @@ -318,7 +314,7 @@ def f(self): @contextmanager def _group_selection_context(groupby): """ - set / reset the _group_selection_context + Set / reset the _group_selection_context. """ groupby._set_group_selection() yield groupby @@ -327,7 +323,7 @@ def _group_selection_context(groupby): class _GroupBy(PandasObject, SelectionMixin): _group_selection = None - _apply_whitelist = frozenset([]) + _apply_whitelist = frozenset() def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, @@ -381,14 +377,16 @@ def __unicode__(self): def _assure_grouper(self): """ - we create the grouper on instantiation - sub-classes may have a different policy + We create the grouper on instantiation sub-classes may have a + different policy. """ pass @property def groups(self): - """ dict {group name -> group labels} """ + """ + Dict {group name -> group labels}. + """ self._assure_grouper() return self.grouper.groups @@ -399,14 +397,16 @@ def ngroups(self): @property def indices(self): - """ dict {group name -> group indices} """ + """ + Dict {group name -> group indices}. + """ self._assure_grouper() return self.grouper.indices def _get_indices(self, names): """ - safe get multiple indices, translate keys for - datelike to underlying repr + Safe get multiple indices, translate keys for + datelike to underlying repr. """ def get_converter(s): @@ -454,7 +454,9 @@ def get_converter(s): return [self.indices.get(name, []) for name in names] def _get_index(self, name): - """ safe get index, translate keys for datelike to underlying repr """ + """ + Safe get index, translate keys for datelike to underlying repr. + """ return self._get_indices([name])[0] @cache_readonly @@ -469,8 +471,10 @@ def _selected_obj(self): def _reset_group_selection(self): """ - Clear group based selection. Used for methods needing to return info on - each group regardless of whether a group selection was previously set. + Clear group based selection. + + Used for methods needing to return info on each group regardless of + whether a group selection was previously set. 
""" if self._group_selection is not None: # GH12839 clear cached selection too when changing group selection @@ -479,8 +483,9 @@ def _reset_group_selection(self): def _set_group_selection(self): """ - Create group based selection. Used when selection is not passed - directly but instead via a grouper. + Create group based selection. + + Used when selection is not passed directly but instead via a grouper. NOTE: this should be paired with a call to _reset_group_selection """ @@ -497,7 +502,8 @@ def _set_group_selection(self): if len(groupers): # GH12839 clear selected obj cache when group selection changes - self._group_selection = ax.difference(Index(groupers)).tolist() + self._group_selection = ax.difference(Index(groupers), + sort=False).tolist() self._reset_cache('_selected_obj') def _set_result_index_ordered(self, result): @@ -578,8 +584,8 @@ def wrapper(*args, **kwargs): # a little trickery for aggregation functions that need an axis # argument kwargs_with_axis = kwargs.copy() - if 'axis' not in kwargs_with_axis or \ - kwargs_with_axis['axis'] is None: + if ('axis' not in kwargs_with_axis or + kwargs_with_axis['axis'] is None): kwargs_with_axis['axis'] = self.axis def curried_with_axis(x): @@ -620,7 +626,7 @@ def curried(x): def get_group(self, name, obj=None): """ - Constructs NDFrame from group with provided name + Constructs NDFrame from group with provided name. Parameters ---------- @@ -646,7 +652,7 @@ def get_group(self, name, obj=None): def __iter__(self): """ - Groupby iterator + Groupby iterator. Returns ------- @@ -710,7 +716,7 @@ def _iterate_slices(self): yield self._selection_name, self._selected_obj def transform(self, func, *args, **kwargs): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _cumcount_array(self, ascending=True): """ @@ -746,11 +752,11 @@ def _cumcount_array(self, ascending=True): def _try_cast(self, result, obj, numeric_only=False): """ - try to cast the result to our obj original type, - we may have roundtripped thru object in the mean-time + Try to cast the result to our obj original type, + we may have roundtripped through object in the mean-time. - if numeric_only is True, then only try to cast numerics - and not datetimelikes + If numeric_only is True, then only try to cast numerics + and not datetimelikes. """ if obj.ndim > 1: @@ -759,7 +765,18 @@ def _try_cast(self, result, obj, numeric_only=False): dtype = obj.dtype if not is_scalar(result): - if numeric_only and is_numeric_dtype(dtype) or not numeric_only: + if is_extension_array_dtype(dtype): + # The function can return something of any type, so check + # if the type is compatible with the calling EA. + try: + result = obj.values._from_sequence(result) + except Exception: + # https://github.com/pandas-dev/pandas/issues/22850 + # pandas has no control over what 3rd-party ExtensionArrays + # do in _values_from_sequence. We still want ops to work + # though, so we catch any regular Exception. 
+ pass + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) return result @@ -854,7 +871,7 @@ def _python_agg_general(self, func, *args, **kwargs): return self._wrap_aggregated_output(output) def _wrap_applied_output(self, *args, **kwargs): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _concat_objects(self, keys, values, not_indexed_same=False): from pandas.core.reshape.concat import concat @@ -937,8 +954,9 @@ def _apply_filter(self, indices, dropna): class GroupBy(_GroupBy): """ - Class for grouping and aggregating relational data. See aggregate, - transform, and apply functions on this object. + Class for grouping and aggregating relational data. + + See aggregate, transform, and apply functions on this object. It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: @@ -1002,7 +1020,9 @@ class GroupBy(_GroupBy): Number of groups """ def _bool_agg(self, val_test, skipna): - """Shared func to call any / all Cython GroupBy implementations""" + """ + Shared func to call any / all Cython GroupBy implementations. + """ def objs_to_bool(vals): try: @@ -1028,7 +1048,7 @@ def result_to_bool(result): @Appender(_doc_template) def any(self, skipna=True): """ - Returns True if any value in the group is truthful, else False + Returns True if any value in the group is truthful, else False. Parameters ---------- @@ -1040,7 +1060,8 @@ def any(self, skipna=True): @Substitution(name='groupby') @Appender(_doc_template) def all(self, skipna=True): - """Returns True if all values in the group are truthful, else False + """ + Returns True if all values in the group are truthful, else False. Parameters ---------- @@ -1052,7 +1073,9 @@ def all(self, skipna=True): @Substitution(name='groupby') @Appender(_doc_template) def count(self): - """Compute count of group, excluding missing values""" + """ + Compute count of group, excluding missing values. + """ # defined here for API doc raise NotImplementedError @@ -1118,7 +1141,7 @@ def mean(self, *args, **kwargs): @Appender(_doc_template) def median(self, **kwargs): """ - Compute median of groups, excluding missing values + Compute median of groups, excluding missing values. For multiple groupings, the result index will be a MultiIndex """ @@ -1139,9 +1162,9 @@ def f(x): @Appender(_doc_template) def std(self, ddof=1, *args, **kwargs): """ - Compute standard deviation of groups, excluding missing values + Compute standard deviation of groups, excluding missing values. - For multiple groupings, the result index will be a MultiIndex + For multiple groupings, the result index will be a MultiIndex. Parameters ---------- @@ -1157,9 +1180,9 @@ def std(self, ddof=1, *args, **kwargs): @Appender(_doc_template) def var(self, ddof=1, *args, **kwargs): """ - Compute variance of groups, excluding missing values + Compute variance of groups, excluding missing values. - For multiple groupings, the result index will be a MultiIndex + For multiple groupings, the result index will be a MultiIndex. Parameters ---------- @@ -1183,9 +1206,9 @@ def var(self, ddof=1, *args, **kwargs): @Appender(_doc_template) def sem(self, ddof=1): """ - Compute standard error of the mean of groups, excluding missing values + Compute standard error of the mean of groups, excluding missing values. - For multiple groupings, the result index will be a MultiIndex + For multiple groupings, the result index will be a MultiIndex. 
Parameters ---------- @@ -1198,7 +1221,9 @@ def sem(self, ddof=1): @Substitution(name='groupby') @Appender(_doc_template) def size(self): - """Compute group sizes""" + """ + Compute group sizes. + """ result = self.grouper.size() if isinstance(self.obj, Series): @@ -1207,7 +1232,9 @@ def size(self): @classmethod def _add_numeric_operations(cls): - """ add numeric operations to the GroupBy generically """ + """ + Add numeric operations to the GroupBy generically. + """ def groupby_function(name, alias, npfunc, numeric_only=True, _convert=False, @@ -1284,7 +1311,8 @@ def last(x): @Appender(_doc_template) def ohlc(self): """ - Compute sum of values, excluding missing values + Compute sum of values, excluding missing values. + For multiple groupings, the result index will be a MultiIndex """ @@ -1299,12 +1327,111 @@ def describe(self, **kwargs): return result.T return result.unstack() - @Substitution(name='groupby') - @Appender(_doc_template) def resample(self, rule, *args, **kwargs): """ - Provide resampling when using a TimeGrouper - Return a new grouper with our resampler appended + Provide resampling when using a TimeGrouper. + + Given a grouper, the function resamples it according to a string + "string" -> "frequency". + + See the :ref:`frequency aliases ` + documentation for more details. + + Parameters + ---------- + rule : str or DateOffset + The offset string or object representing target grouper conversion. + *args, **kwargs + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + + Returns + ------- + Grouper + Return a new grouper with our resampler appended. + + See Also + -------- + pandas.Grouper : Specify a frequency to resample with when + grouping by a key. + DatetimeIndex.resample : Frequency conversion and resampling of + time series. + + Examples + -------- + >>> idx = pd.date_range('1/1/2000', periods=4, freq='T') + >>> df = pd.DataFrame(data=4 * [range(2)], + ... index=idx, + ... columns=['a', 'b']) + >>> df.iloc[2, 0] = 5 + >>> df + a b + 2000-01-01 00:00:00 0 1 + 2000-01-01 00:01:00 0 1 + 2000-01-01 00:02:00 5 1 + 2000-01-01 00:03:00 0 1 + + Downsample the DataFrame into 3 minute bins and sum the values of + the timestamps falling into a bin. + + >>> df.groupby('a').resample('3T').sum() + a b + a + 0 2000-01-01 00:00:00 0 2 + 2000-01-01 00:03:00 0 1 + 5 2000-01-01 00:00:00 5 1 + + Upsample the series into 30 second bins. + + >>> df.groupby('a').resample('30S').sum() + a b + a + 0 2000-01-01 00:00:00 0 1 + 2000-01-01 00:00:30 0 0 + 2000-01-01 00:01:00 0 1 + 2000-01-01 00:01:30 0 0 + 2000-01-01 00:02:00 0 0 + 2000-01-01 00:02:30 0 0 + 2000-01-01 00:03:00 0 1 + 5 2000-01-01 00:02:00 5 1 + + Resample by month. Values are assigned to the month of the period. + + >>> df.groupby('a').resample('M').sum() + a b + a + 0 2000-01-31 0 3 + 5 2000-01-31 5 1 + + Downsample the series into 3 minute bins as above, but close the right + side of the bin interval. + + >>> df.groupby('a').resample('3T', closed='right').sum() + a b + a + 0 1999-12-31 23:57:00 0 1 + 2000-01-01 00:00:00 0 2 + 5 2000-01-01 00:00:00 5 1 + + Downsample the series into 3 minute bins and close the right side of + the bin interval, but label each bin using the right edge instead of + the left. + + >>> df.groupby('a').resample('3T', closed='right', label='right').sum() + a b + a + 0 2000-01-01 00:00:00 0 1 + 2000-01-01 00:03:00 0 2 + 5 2000-01-01 00:03:00 5 1 + + Add an offset of twenty seconds. 
+ + >>> df.groupby('a').resample('3T', loffset='20s').sum() + a b + a + 0 2000-01-01 00:00:20 0 2 + 2000-01-01 00:03:20 0 1 + 5 2000-01-01 00:00:20 5 1 """ from pandas.core.resample import get_resampler_for_grouping return get_resampler_for_grouping(self, rule, *args, **kwargs) @@ -1313,9 +1440,7 @@ def resample(self, rule, *args, **kwargs): @Appender(_doc_template) def rolling(self, *args, **kwargs): """ - Return a rolling grouper, providing rolling - functionality per group - + Return a rolling grouper, providing rolling functionality per group. """ from pandas.core.window import RollingGroupby return RollingGroupby(self, *args, **kwargs) @@ -1325,14 +1450,14 @@ def rolling(self, *args, **kwargs): def expanding(self, *args, **kwargs): """ Return an expanding grouper, providing expanding - functionality per group - + functionality per group. """ from pandas.core.window import ExpandingGroupby return ExpandingGroupby(self, *args, **kwargs) def _fill(self, direction, limit=None): - """Shared function for `pad` and `backfill` to call Cython method + """ + Shared function for `pad` and `backfill` to call Cython method. Parameters ---------- @@ -1366,7 +1491,7 @@ def _fill(self, direction, limit=None): @Substitution(name='groupby') def pad(self, limit=None): """ - Forward fill the values + Forward fill the values. Parameters ---------- @@ -1386,7 +1511,7 @@ def pad(self, limit=None): @Substitution(name='groupby') def backfill(self, limit=None): """ - Backward fill the values + Backward fill the values. Parameters ---------- @@ -1490,8 +1615,10 @@ def nth(self, n, dropna=None): self._set_group_selection() if not dropna: - mask = np.in1d(self._cumcount_array(), nth_values) | \ - np.in1d(self._cumcount_array(ascending=False) + 1, -nth_values) + mask_left = np.in1d(self._cumcount_array(), nth_values) + mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, + -nth_values) + mask = mask_left | mask_right out = self._selected_obj[mask] if not self.as_index: @@ -1517,7 +1644,8 @@ def nth(self, n, dropna=None): # just returns NaN raise ValueError("For a DataFrame groupby, dropna must be " "either None, 'any' or 'all', " - "(was passed %s)." % (dropna),) + "(was passed {dropna}).".format( + dropna=dropna)) # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf @@ -1552,8 +1680,8 @@ def nth(self, n, dropna=None): result.loc[mask] = np.nan # reset/reindex to the original groups - if len(self.obj) == len(dropped) or \ - len(result) == len(self.grouper.result_index): + if (len(self.obj) == len(dropped) or + len(result) == len(self.grouper.result_index)): result.index = self.grouper.result_index else: result = result.reindex(self.grouper.result_index) @@ -1614,7 +1742,7 @@ def ngroup(self, ascending=True): 5 0 dtype: int64 - See also + See Also -------- .cumcount : Number the rows in each group. """ @@ -1670,7 +1798,7 @@ def cumcount(self, ascending=True): 5 0 dtype: int64 - See also + See Also -------- .ngroup : Number the groups themselves. """ @@ -1720,7 +1848,9 @@ def rank(self, method='average', ascending=True, na_option='keep', @Substitution(name='groupby') @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): - """Cumulative product for each group""" + """ + Cumulative product for each group. 
+ """ nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only', 'skipna']) if axis != 0: @@ -1731,7 +1861,9 @@ def cumprod(self, axis=0, *args, **kwargs): @Substitution(name='groupby') @Appender(_doc_template) def cumsum(self, axis=0, *args, **kwargs): - """Cumulative sum for each group""" + """ + Cumulative sum for each group. + """ nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only', 'skipna']) if axis != 0: @@ -1742,7 +1874,9 @@ def cumsum(self, axis=0, *args, **kwargs): @Substitution(name='groupby') @Appender(_doc_template) def cummin(self, axis=0, **kwargs): - """Cumulative min for each group""" + """ + Cumulative min for each group. + """ if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) @@ -1751,7 +1885,9 @@ def cummin(self, axis=0, **kwargs): @Substitution(name='groupby') @Appender(_doc_template) def cummax(self, axis=0, **kwargs): - """Cumulative max for each group""" + """ + Cumulative max for each group. + """ if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) @@ -1763,7 +1899,8 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, result_is_index=False, pre_processing=None, post_processing=None, **kwargs): - """Get result for Cythonized functions + """ + Get result for Cythonized functions. Parameters ---------- @@ -1858,7 +1995,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, @Appender(_doc_template) def shift(self, periods=1, freq=None, axis=0): """ - Shift each group by periods observations + Shift each group by periods observations. Parameters ---------- @@ -1881,7 +2018,9 @@ def shift(self, periods=1, freq=None, axis=0): @Appender(_doc_template) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, axis=0): - """Calculate pct_change of each value to previous entry in group""" + """ + Calculate pct_change of each value to previous entry in group. + """ if freq is not None or axis != 0: return self.apply(lambda x: x.pct_change(periods=periods, fill_method=fill_method, @@ -1925,7 +2064,7 @@ def head(self, n=5): @Appender(_doc_template) def tail(self, n=5): """ - Returns last n rows of each group + Returns last n rows of each group. Essentially equivalent to ``.apply(lambda x: x.tail(n))``, except ignores as_index flag. 
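A short, hedged usage sketch for the per-group cumulative reductions whose docstrings are cleaned up above (`cumprod`/`cumsum`/`cummin`/`cummax`): each group is accumulated independently while the original row order is kept. The frame below is made up for illustration.

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                   'val': [3, 1, 2, 5]})

# Accumulate within each group; the row order of the original frame is kept.
print(df.groupby('key')['val'].cumsum().tolist())   # [3, 4, 2, 7]
print(df.groupby('key')['val'].cummin().tolist())   # [3, 1, 2, 2]

# As the hunks above show, axis != 0 falls back to apply, roughly
# self.apply(lambda x: np.minimum.accumulate(x, axis)) for cummin.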
@@ -1961,6 +2100,6 @@ def groupby(obj, by, **kwds): from pandas.core.groupby.generic import DataFrameGroupBy klass = DataFrameGroupBy else: # pragma: no cover - raise TypeError('invalid type: %s' % type(obj)) + raise TypeError('invalid type: {}'.format(obj)) return klass(obj, by, **kwds) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e7144fb1d2932..d8df227d4911a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -4,30 +4,26 @@ """ import warnings + import numpy as np +import pandas.compat as compat +from pandas.compat import callable, zip from pandas.util._decorators import cache_readonly -from pandas import compat -from pandas.compat import zip, callable - -from pandas.core.dtypes.generic import ABCSeries -from pandas.core.arrays import ExtensionArray, Categorical -from pandas.core.index import ( - Index, MultiIndex, CategoricalIndex) from pandas.core.dtypes.common import ( - ensure_categorical, - is_hashable, - is_list_like, - is_timedelta64_dtype, - is_datetime64_dtype, - is_categorical_dtype, - is_scalar) -from pandas.core.series import Series -from pandas.core.frame import DataFrame + ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable, + is_list_like, is_scalar, is_timedelta64_dtype) +from pandas.core.dtypes.generic import ABCSeries + +import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com +from pandas.core.frame import DataFrame from pandas.core.groupby.ops import BaseGrouper -import pandas.core.algorithms as algorithms +from pandas.core.index import CategoricalIndex, Index, MultiIndex +from pandas.core.series import Series + from pandas.io.formats.printing import pprint_thing @@ -157,8 +153,8 @@ def _set_grouper(self, obj, sort=False): if self.key is not None: key = self.key # The 'on' is already defined - if getattr(self.grouper, 'name', None) == key and \ - isinstance(obj, ABCSeries): + if (getattr(self.grouper, 'name', None) == key and + isinstance(obj, ABCSeries)): ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: @@ -261,7 +257,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if level is not None: if not isinstance(level, int): if level not in index.names: - raise AssertionError('Level %s not in index' % str(level)) + raise AssertionError('Level {} not in index'.format(level)) level = index.names.index(level) if self.name is None: @@ -321,7 +317,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) - raise ValueError("Grouper for '%s' not 1-dimensional" % t) + raise ValueError( + "Grouper for '{}' not 1-dimensional".format(t)) self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): @@ -464,8 +461,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, if isinstance(level, compat.string_types): if obj.index.name != level: - raise ValueError('level name %s is not the name of the ' - 'index' % level) + raise ValueError('level name {} is not the name of the ' + 'index'.format(level)) elif level > 0 or level < -1: raise ValueError('level > 0 or level < -1 only valid with ' ' MultiIndex') @@ -530,9 +527,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, except Exception: all_in_columns_index = False - if not 
any_callable and not all_in_columns_index and \ - not any_arraylike and not any_groupers and \ - match_axis_length and level is None: + if (not any_callable and not all_in_columns_index and + not any_arraylike and not any_groupers and + match_axis_length and level is None): keys = [com.asarray_tuplesafe(keys)] if isinstance(level, (tuple, list)): @@ -593,15 +590,15 @@ def is_in_obj(gpr): # create the Grouping # allow us to passing the actual Grouping as the gpr - ping = Grouping(group_axis, - gpr, - obj=obj, - name=name, - level=level, - sort=sort, - observed=observed, - in_axis=in_axis) \ - if not isinstance(gpr, Grouping) else gpr + ping = (Grouping(group_axis, + gpr, + obj=obj, + name=name, + level=level, + sort=sort, + observed=observed, + in_axis=in_axis) + if not isinstance(gpr, Grouping) else gpr) groupings.append(ping) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d9f7b4d9c31c3..87f48d5a40554 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -6,42 +6,33 @@ are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. """ -import copy import collections + import numpy as np -from pandas._libs import lib, reduction, NaT, iNaT, groupby as libgroupby +from pandas._libs import NaT, groupby as libgroupby, iNaT, lib, reduction +from pandas.compat import lzip, range, zip +from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly -from pandas.compat import zip, range, lzip +from pandas.core.dtypes.common import ( + ensure_float64, ensure_int64, ensure_int64_or_float64, ensure_object, + ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, + is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, + is_timedelta64_dtype, needs_i8_conversion) +from pandas.core.dtypes.missing import _maybe_fill, isna +import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin -from pandas.core.dtypes.missing import isna, _maybe_fill -from pandas.core.index import ( - Index, MultiIndex, ensure_index) -from pandas.core.dtypes.common import ( - ensure_float64, - ensure_platform_int, - ensure_int64, - ensure_int64_or_float64, - ensure_object, - needs_i8_conversion, - is_integer_dtype, - is_complex_dtype, - is_bool_dtype, - is_numeric_dtype, - is_timedelta64_dtype, - is_datetime64_any_dtype, - is_categorical_dtype) -from pandas.core.series import Series +import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -import pandas.core.common as com from pandas.core.groupby import base -from pandas.core.sorting import (get_group_index_sorter, get_group_index, - compress_group_index, get_flattened_iterator, - decons_obs_group_ids, get_indexer_dict) -import pandas.core.algorithms as algorithms +from pandas.core.index import Index, MultiIndex, ensure_index +from pandas.core.series import Series +from pandas.core.sorting import ( + compress_group_index, decons_obs_group_ids, get_flattened_iterator, + get_group_index, get_group_index_sorter, get_indexer_dict) def generate_bins_generic(values, binner, closed): @@ -299,10 +290,10 @@ def result_index(self): if not self.compressed and len(self.groupings) == 1: return self.groupings[0].result_index.rename(self.names[0]) - labels = self.recons_labels + codes = self.recons_labels levels = [ping.result_index for ping in self.groupings] result = MultiIndex(levels=levels, - labels=labels, + codes=codes, verify_integrity=False, names=self.names) return result @@ -388,7 +379,8 @@ 
def get_func(fname): # otherwise find dtype-specific version, falling back to object for dt in [dtype_str, 'object']: - f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None) + f = getattr(libgroupby, "{fname}_{dtype_str}".format( + fname=fname, dtype_str=dtype_str), None) if f is not None: return f @@ -411,9 +403,11 @@ def wrapper(*args, **kwargs): func = get_func(ftype) if func is None: - raise NotImplementedError("function is not implemented for this" - "dtype: [how->%s,dtype->%s]" % - (how, dtype_str)) + raise NotImplementedError( + "function is not implemented for this dtype: " + "[how->{how},dtype->{dtype_str}]".format(how=how, + dtype_str=dtype_str)) + return func def _cython_operation(self, kind, values, how, axis, min_count=-1, @@ -493,7 +487,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, out_dtype = 'float' else: if is_numeric: - out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + out_dtype = '{kind}{itemsize}'.format( + kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = 'object' @@ -521,8 +516,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, result = result.astype('float64') result[mask] = np.nan - if kind == 'aggregate' and \ - self._filter_empty_groups and not counts.all(): + if (kind == 'aggregate' and + self._filter_empty_groups and not counts.all()): if result.ndim == 2: try: result = lib.row_bool_subset( @@ -683,10 +678,8 @@ def groups(self): # this is mainly for compat # GH 3881 - result = {} - for key, value in zip(self.binlabels, self.bins): - if key is not NaT: - result[key] = value + result = {key: value for key, value in zip(self.binlabels, self.bins) + if key is not NaT} return result @property @@ -743,12 +736,9 @@ def group_info(self): else: comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) - return comp_ids.astype('int64', copy=False), \ - obs_group_ids.astype('int64', copy=False), ngroups - - @cache_readonly - def ngroups(self): - return len(self.result_index) + return (comp_ids.astype('int64', copy=False), + obs_group_ids.astype('int64', copy=False), + ngroups) @cache_readonly def result_index(self): @@ -776,11 +766,6 @@ def agg_series(self, obj, func): grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() - # ---------------------------------------------------------------------- - # cython aggregation - - _cython_functions = copy.deepcopy(BaseGrouper._cython_functions) - def _get_axes(group): if isinstance(group, Series): @@ -849,7 +834,7 @@ def _chop(self, sdata, slice_obj): return sdata.iloc[slice_obj] def apply(self, f): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) class SeriesSplitter(DataSplitter): @@ -860,9 +845,6 @@ def _chop(self, sdata, slice_obj): class FrameSplitter(DataSplitter): - def __init__(self, data, labels, ngroups, axis=0): - super(FrameSplitter, self).__init__(data, labels, ngroups, axis=axis) - def fast_apply(self, f, names): # must return keys::list, values::list, mutated::bool try: diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index a1868980faed3..6138f73726e0a 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -1,23 +1,20 @@ """ datetimelike delegation """ - import numpy as np -from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.common import ( - is_period_arraylike, - is_datetime_arraylike, is_integer_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, 
is_categorical_dtype, - is_list_like) + is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_datetime_arraylike, is_integer_dtype, is_list_like, is_period_arraylike, + is_timedelta64_dtype) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.accessor import PandasDelegate, delegate_names +from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin, PandasObject from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.period import PeriodArray from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.algorithms import take_1d class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): @@ -46,7 +43,8 @@ def _get_values(self): else: if is_period_arraylike(data): - return PeriodIndex(data, copy=False, name=self.name) + # TODO: use to_period_array + return PeriodArray(data, copy=False) if is_datetime_arraylike(data): return DatetimeIndex(data, copy=False, name=self.name) @@ -270,11 +268,11 @@ def freq(self): return self._get_values().inferred_freq -@delegate_names(delegate=PeriodIndex, - accessors=PeriodIndex._datetimelike_ops, +@delegate_names(delegate=PeriodArray, + accessors=PeriodArray._datetimelike_ops, typ="property") -@delegate_names(delegate=PeriodIndex, - accessors=PeriodIndex._datetimelike_methods, +@delegate_names(delegate=PeriodArray, + accessors=PeriodArray._datetimelike_methods, typ="method") class PeriodProperties(Properties): """ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index e50a4b099a8e1..6299fc482d0df 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,23 +1,21 @@ import textwrap import warnings -from pandas.core.indexes.base import (Index, - _new_Index, - ensure_index, - ensure_index_from_sequences, - InvalidIndexError) # noqa -from pandas.core.indexes.category import CategoricalIndex # noqa -from pandas.core.indexes.multi import MultiIndex # noqa -from pandas.core.indexes.interval import IntervalIndex # noqa -from pandas.core.indexes.numeric import (NumericIndex, Float64Index, # noqa - Int64Index, UInt64Index) -from pandas.core.indexes.range import RangeIndex # noqa -from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.datetimes import DatetimeIndex +from pandas._libs import NaT, lib import pandas.core.common as com -from pandas._libs import lib, NaT +from pandas.core.indexes.base import ( + Index, _new_Index, ensure_index, ensure_index_from_sequences) +from pandas.core.indexes.base import InvalidIndexError # noqa:F401 +from pandas.core.indexes.category import CategoricalIndex # noqa:F401 +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.interval import IntervalIndex # noqa:F401 +from pandas.core.indexes.multi import MultiIndex # noqa:F401 +from pandas.core.indexes.numeric import ( # noqa:F401 + Float64Index, Int64Index, NumericIndex, UInt64Index) +from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.range import RangeIndex # noqa:F401 +from pandas.core.indexes.timedeltas import TimedeltaIndex _sort_msg = textwrap.dedent("""\ Sorting because non-concatenation axis is not aligned. 
A future version @@ -44,18 +42,69 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): - # Extract combined index: return intersection or union (depending on the - # value of "intersect") of indexes on given axis, or None if all objects - # lack indexes (e.g. they are numpy arrays) + """ + Extract combined index: return intersection or union (depending on the + value of "intersect") of indexes on given axis, or None if all objects + lack indexes (e.g. they are numpy arrays). + + Parameters + ---------- + objs : list of objects + Each object will only be considered if it has a _get_axis + attribute. + intersect : bool, default False + If True, calculate the intersection between indexes. Otherwise, + calculate the union. + axis : {0 or 'index', 1 or 'outer'}, default 0 + The axis to extract indexes from. + sort : bool, default True + Whether the result index should come out sorted or not. + + Returns + ------- + Index + """ obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, '_get_axis')] if obs_idxes: return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) +def _get_distinct_objs(objs): + """ + Return a list with distinct elements of "objs" (different ids). + Preserves order. + """ + ids = set() + res = [] + for obj in objs: + if not id(obj) in ids: + ids.add(id(obj)) + res.append(obj) + return res + + def _get_combined_index(indexes, intersect=False, sort=False): + """ + Return the union or intersection of indexes. + + Parameters + ---------- + indexes : list of Index or list objects + When intersect=True, do not accept list of lists. + intersect : bool, default False + If True, calculate the intersection between indexes. Otherwise, + calculate the union. + sort : bool, default False + Whether the result index should come out sorted or not. + + Returns + ------- + Index + """ + # TODO: handle index names! - indexes = com.get_distinct_objs(indexes) + indexes = _get_distinct_objs(indexes) if len(indexes) == 0: index = Index([]) elif len(indexes) == 1: @@ -77,6 +126,21 @@ def _get_combined_index(indexes, intersect=False, sort=False): def _union_indexes(indexes, sort=True): + """ + Return the union of indexes. + + The behavior of sort and names is not consistent. + + Parameters + ---------- + indexes : list of Index or list objects + sort : bool, default True + Whether the result index should come out sorted or not. + + Returns + ------- + Index + """ if len(indexes) == 0: raise AssertionError('Must have at least 1 Index to union') if len(indexes) == 1: @@ -88,6 +152,19 @@ def _union_indexes(indexes, sort=True): indexes, kind = _sanitize_and_check(indexes) def _unique_indices(inds): + """ + Convert indexes to lists and concatenate them, removing duplicates. + + The final dtype is inferred. + + Parameters + ---------- + inds : list of Index or list objects + + Returns + ------- + Index + """ def conv(i): if isinstance(i, Index): i = i.tolist() @@ -126,6 +203,26 @@ def conv(i): def _sanitize_and_check(indexes): + """ + Verify the type of indexes and convert lists to Index. + + Cases: + + - [list, list, ...]: Return ([list, list, ...], 'list') + - [list, Index, ...]: Return _sanitize_and_check([Index, Index, ...]) + Lists are sorted and converted to Index. + - [Index, Index, ...]: Return ([Index, Index, ...], TYPE) + TYPE = 'special' if at least one special type, 'array' otherwise. 
+ + Parameters + ---------- + indexes : list of Index or list objects + + Returns + ------- + sanitized_indexes : list of Index or list objects + type : {'list', 'array', 'special'} + """ kinds = list({type(index) for index in indexes}) if list in kinds: @@ -144,6 +241,21 @@ def _sanitize_and_check(indexes): def _get_consensus_names(indexes): + """ + Give a consensus 'names' to indexes. + + If there's exactly one non-empty 'names', return this, + otherwise, return empty. + + Parameters + ---------- + indexes : list of Index objects + + Returns + ------- + list + A list representing the consensus 'names' found. + """ # find the non-none names, need to tupleify to make # the set hashable, then reverse on return @@ -155,6 +267,18 @@ def _get_consensus_names(indexes): def _all_indexes_same(indexes): + """ + Determine if all indexes contain the same elements. + + Parameters + ---------- + indexes : list of Index objects + + Returns + ------- + bool + True if all indexes contain the same elements, False otherwise. + """ first = indexes[0] for index in indexes[1:]: if not first.equals(index): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 51c84d6e28cb4..88510e84a29a5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,69 +1,50 @@ from datetime import datetime, timedelta -import warnings import operator from textwrap import dedent +import warnings import numpy as np -from pandas._libs import (lib, index as libindex, tslibs, - algos as libalgos, join as libjoin, - Timedelta) -from pandas._libs.lib import is_datetime_array -from pandas.compat import range, u, set_function_name +from pandas._libs import ( + Timedelta, algos as libalgos, index as libindex, join as libjoin, lib, + tslibs) +from pandas._libs.lib import is_datetime_array +import pandas.compat as compat +from pandas.compat import range, set_function_name, u from pandas.compat.numpy import function as nv -from pandas import compat +from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.core.accessor import CachedAccessor -from pandas.core.arrays import ExtensionArray -from pandas.core.dtypes.generic import ( - ABCSeries, ABCDataFrame, - ABCMultiIndex, - ABCPeriodIndex, ABCTimedeltaIndex, - ABCDateOffset) -from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( - ensure_int64, - ensure_object, - ensure_categorical, - ensure_platform_int, - is_integer, - is_float, - is_dtype_equal, - is_dtype_union_equal, - is_object_dtype, - is_categorical, - is_categorical_dtype, - is_interval_dtype, - is_period_dtype, - is_bool, - is_bool_dtype, - is_signed_integer_dtype, - is_unsigned_integer_dtype, - is_integer_dtype, is_float_dtype, - is_datetime64_any_dtype, - is_datetime64tz_dtype, - is_timedelta64_dtype, - is_extension_array_dtype, - is_hashable, - is_iterator, is_list_like, - is_scalar) - -from pandas.core.base import PandasObject, IndexOpsMixin -import pandas.core.common as com + ensure_categorical, ensure_int64, ensure_object, ensure_platform_int, + is_bool, is_bool_dtype, is_categorical, is_categorical_dtype, + is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal, + is_dtype_union_equal, is_extension_array_dtype, is_float, is_float_dtype, + is_hashable, is_integer, is_integer_dtype, is_interval_dtype, is_iterator, + is_list_like, is_object_dtype, is_period_dtype, is_scalar, + is_signed_integer_dtype, is_timedelta64_dtype, 
is_unsigned_integer_dtype) +import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, + ABCMultiIndex, ABCPeriodIndex, ABCSeries, ABCTimedeltaArray, + ABCTimedeltaIndex) +from pandas.core.dtypes.missing import array_equivalent, isna + from pandas.core import ops -from pandas.util._decorators import ( - Appender, Substitution, cache_readonly) +from pandas.core.accessor import CachedAccessor +import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray +from pandas.core.base import IndexOpsMixin, PandasObject +import pandas.core.common as com from pandas.core.indexes.frozen import FrozenList -import pandas.core.dtypes.concat as _concat import pandas.core.missing as missing -import pandas.core.algorithms as algos +from pandas.core.ops import get_op_result_name, make_invalid_op import pandas.core.sorting as sorting -from pandas.io.formats.printing import ( - pprint_thing, default_pprint, format_object_summary, format_object_attrs) -from pandas.core.ops import make_invalid_op from pandas.core.strings import StringMethods +from pandas.io.formats.printing import ( + default_pprint, format_object_attrs, format_object_summary, pprint_thing) + __all__ = ['Index'] _unsortable_types = frozenset(('mixed', 'mixed-integer')) @@ -123,7 +104,8 @@ def index_arithmetic_method(self, other): elif isinstance(other, ABCTimedeltaIndex): # Defer to subclass implementation return NotImplemented - elif isinstance(other, np.ndarray) and is_timedelta64_dtype(other): + elif (isinstance(other, (np.ndarray, ABCTimedeltaArray)) and + is_timedelta64_dtype(other)): # GH#22390; wrap in Series for op, this will in turn wrap in # TimedeltaIndex, but will correctly raise TypeError instead of # NullFrequencyError for add/sub ops @@ -168,8 +150,9 @@ class InvalidIndexError(Exception): def _new_Index(cls, d): - """ This is called upon unpickling, rather than the default which doesn't - have arguments and breaks __new__ + """ + This is called upon unpickling, rather than the default which doesn't + have arguments and breaks __new__. """ # required for backward compat, because PI can't be instantiated with # ordinals through __new__ GH #13277 @@ -182,7 +165,7 @@ def _new_Index(cls, d): class Index(IndexOpsMixin, PandasObject): """ Immutable ndarray implementing an ordered, sliceable set. The basic object - storing axis labels for all pandas objects + storing axis labels for all pandas objects. Parameters ---------- @@ -212,21 +195,31 @@ class Index(IndexOpsMixin, PandasObject): See Also --------- - RangeIndex : Index implementing a monotonic integer range + RangeIndex : Index implementing a monotonic integer range. CategoricalIndex : Index of :class:`Categorical` s. - MultiIndex : A multi-level, or hierarchical, Index - IntervalIndex : an Index of :class:`Interval` s. + MultiIndex : A multi-level, or hierarchical, Index. + IntervalIndex : An Index of :class:`Interval` s. 
DatetimeIndex, TimedeltaIndex, PeriodIndex Int64Index, UInt64Index, Float64Index """ # To hand over control to subclasses _join_precedence = 1 - # Cython methods - _left_indexer_unique = libjoin.left_join_indexer_unique_object - _left_indexer = libjoin.left_join_indexer_object - _inner_indexer = libjoin.inner_join_indexer_object - _outer_indexer = libjoin.outer_join_indexer_object + # Cython methods; see github.com/cython/cython/issues/2647 + # for why we need to wrap these instead of making them class attributes + # Moreover, cython will choose the appropriate-dtyped sub-function + # given the dtypes of the passed arguments + def _left_indexer_unique(self, left, right): + return libjoin.left_join_indexer_unique(left, right) + + def _left_indexer(self, left, right): + return libjoin.left_join_indexer(left, right) + + def _inner_indexer(self, left, right): + return libjoin.inner_join_indexer(left, right) + + def _outer_indexer(self, left, right): + return libjoin.outer_join_indexer(left, right) _typ = 'index' _data = None @@ -251,14 +244,21 @@ class Index(IndexOpsMixin, PandasObject): str = CachedAccessor("str", StringMethods) + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=False, tupleize_cols=True, **kwargs): + fastpath=None, tupleize_cols=True, **kwargs): if name is None and hasattr(data, 'name'): name = data.name - if fastpath: - return cls._simple_new(data, name) + if fastpath is not None: + warnings.warn("The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, stacklevel=2) + if fastpath: + return cls._simple_new(data, name) from .range import RangeIndex @@ -287,11 +287,19 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, (dtype is not None and is_datetime64_any_dtype(dtype)) or 'tz' in kwargs): from pandas import DatetimeIndex - result = DatetimeIndex(data, copy=copy, name=name, - dtype=dtype, **kwargs) + if dtype is not None and is_dtype_equal(_o_dtype, dtype): - return Index(result.to_pydatetime(), dtype=_o_dtype) + # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, + # will raise in the where `data` is already tz-aware. So + # we leave it out of this step and cast to object-dtype after + # the DatetimeIndex construction. 
+ # Note we can pass copy=False because the .astype below + # will always make a copy + result = DatetimeIndex(data, copy=False, name=name, **kwargs) + return result.astype(object) else: + result = DatetimeIndex(data, copy=copy, name=name, + dtype=dtype, **kwargs) return result elif (is_timedelta64_dtype(data) or @@ -303,6 +311,11 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, else: return result + elif is_period_dtype(data) and not is_object_dtype(dtype): + from pandas import PeriodIndex + result = PeriodIndex(data, copy=copy, name=name, **kwargs) + return result + # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): data = np.asarray(data) @@ -375,8 +388,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # maybe coerce to a sub-class from pandas.core.indexes.period import ( PeriodIndex, IncompatibleFrequency) - if isinstance(data, PeriodIndex): - return PeriodIndex(data, copy=copy, name=name, **kwargs) + if is_signed_integer_dtype(data.dtype): from .numeric import Int64Index return Int64Index(data, copy=copy, dtype=dtype, name=name) @@ -482,8 +494,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, @classmethod def _simple_new(cls, values, name=None, dtype=None, **kwargs): """ - we require the we have a dtype compat for the values - if we are passed a non-dtype compat, then coerce using the constructor + We require that we have a dtype compat for the values. If we are passed + a non-dtype compat, then coerce using the constructor. Must be careful not to recurse. """ @@ -496,6 +508,12 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): values = cls(values, name=name, dtype=dtype, **kwargs)._ndarray_values + if isinstance(values, (ABCSeries, ABCIndexClass)): + # Index._data must always be an ndarray. + # This is no-copy for when _values is an ndarray, + # which should be always at this point. + values = np.asarray(values._values) + result = object.__new__(cls) result._data = values result.name = name @@ -503,10 +521,23 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): setattr(result, k, v) return result._reset_identity() + @cache_readonly + def _constructor(self): + return type(self) + + # -------------------------------------------------------------------- + # Index Internals Methods + + def _get_attributes_dict(self): + """ + Return an attributes dict for my class. + """ + return {k: getattr(self, k, None) for k in self._attributes} + _index_shared_docs['_shallow_copy'] = """ - create a new Index with the same class as the caller, don't copy the + Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking - precedence + precedence. *this is an internal non-public method* @@ -527,14 +558,18 @@ def _shallow_copy(self, values=None, **kwargs): # _simple_new expects an ndarray values = getattr(values, 'values', values) + if isinstance(values, ABCDatetimeIndex): + # `self.values` returns `self` for tz-aware, so we need to unwrap + # more specifically + values = values.asi8 return self._simple_new(values, **attributes) def _shallow_copy_with_infer(self, values, **kwargs): """ - create a new Index inferring the class with passed value, don't copy + Create a new Index inferring the class with passed value, don't copy the data, use the same object attributes with passed in attributes - taking precedence + taking precedence. 
*this is an internal non-public method* @@ -557,11 +592,11 @@ def _shallow_copy_with_infer(self, values, **kwargs): def _deepcopy_if_needed(self, orig, copy=False): """ - .. versionadded:: 0.19.0 - Make a copy of self if data coincides (in memory) with orig. Subclasses should override this if self._base is not an ndarray. + .. versionadded:: 0.19.0 + Parameters ---------- orig : ndarray @@ -589,43 +624,9 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") - def _sort_levels_monotonic(self): - """ compat with MultiIndex """ - return self - - _index_shared_docs['_get_grouper_for_level'] = """ - Get index grouper corresponding to an index level - - Parameters - ---------- - mapper: Group mapping function or None - Function mapping index values to groups - level : int or None - Index level - - Returns - ------- - grouper : Index - Index of values to group on - labels : ndarray of int or None - Array of locations in level_index - uniques : Index or None - Index of unique values for level - """ - - @Appender(_index_shared_docs['_get_grouper_for_level']) - def _get_grouper_for_level(self, mapper, level=None): - assert level is None or level == 0 - if mapper is None: - grouper = self - else: - grouper = self.map(mapper) - - return grouper, None, None - def is_(self, other): """ - More flexible, faster check like ``is`` but that works through views + More flexible, faster check like ``is`` but that works through views. Note: this is *not* the same as ``Index.identical()``, which checks that metadata is also the same. @@ -644,24 +645,39 @@ def is_(self, other): other, '_id', Ellipsis) and self._id is not None def _reset_identity(self): - """Initializes or resets ``_id`` attribute with new object""" + """ + Initializes or resets ``_id`` attribute with new object. + """ self._id = _Identity() return self + def _cleanup(self): + self._engine.clear_mapping() + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + return self._engine_type(lambda: self._ndarray_values, len(self)) + + # -------------------------------------------------------------------- + # Array-Like Methods + # ndarray compat def __len__(self): """ - return the length of the Index + Return the length of the Index. """ return len(self._data) def __array__(self, dtype=None): - """ the array interface, return my values """ + """ + The array interface, return my values. + """ return self._data.view(np.ndarray) def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc + Gets called after a ufunc. """ if is_bool_dtype(result): return result @@ -672,106 +688,141 @@ def __array_wrap__(self, result, context=None): @cache_readonly def dtype(self): - """ return the dtype object of the underlying data """ + """ + Return the dtype object of the underlying data. + """ return self._data.dtype @cache_readonly def dtype_str(self): - """ return the dtype str of the underlying data """ + """ + Return the dtype str of the underlying data. + """ return str(self.dtype) - @property - def values(self): - """ return the underlying data as an ndarray """ - return self._data.view(np.ndarray) - - @property - def _values(self): - # type: () -> Union[ExtensionArray, Index] - # TODO(EA): remove index types as they become extension arrays - """The best array representation. - - This is an ndarray, ExtensionArray, or Index subclass. This differs - from ``_ndarray_values``, which always returns an ndarray. 
- - Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index``. + def ravel(self, order='C'): + """ + Return an ndarray of the flattened values of the underlying data. - It may differ from the public '.values' method. + See Also + -------- + numpy.ndarray.ravel + """ + return self._ndarray_values.ravel(order=order) - index | values | _values | _ndarray_values | - ----------------- | -------------- -| ----------- | --------------- | - CategoricalIndex | Categorical | Categorical | codes | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + def view(self, cls=None): - For the following, the ``._values`` is currently ``ndarray[object]``, - but will soon be an ``ExtensionArray`` + # we need to see if we are subclassing an + # index type here + if cls is not None and not hasattr(cls, '_typ'): + result = self._data.view(cls) + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------ | --------------- | - PeriodIndex | ndarray[object] | ndarray[obj] | ndarray[int] | - IntervalIndex | ndarray[object] | ndarray[obj] | ndarray[object] | + _index_shared_docs['astype'] = """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. - See Also - -------- - values - _ndarray_values - """ - return self.values + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. - def get_values(self): + .. versionadded:: 0.19.0 """ - Return `Index` data as an `numpy.ndarray`. - Returns - ------- - numpy.ndarray - A one-dimensional numpy array of the `Index` values. + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype): + return self.copy() if copy else self - See Also - -------- - Index.values : The attribute that get_values wraps. + elif is_categorical_dtype(dtype): + from .category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) - Examples - -------- - Getting the `Index` values of a `DataFrame`: + elif is_extension_array_dtype(dtype): + return Index(np.asarray(self), dtype=dtype, copy=copy) - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) - >>> df - A B C - a 1 2 3 - b 4 5 6 - c 7 8 9 - >>> df.index.get_values() - array(['a', 'b', 'c'], dtype=object) + try: + if is_datetime64tz_dtype(dtype): + from pandas import DatetimeIndex + return DatetimeIndex(self.values, name=self.name, dtype=dtype, + copy=copy) + return Index(self.values.astype(dtype, copy=copy), name=self.name, + dtype=dtype) + except (TypeError, ValueError): + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - Standalone `Index` values: + _index_shared_docs['take'] = """ + Return a new %(klass)s of the values selected by the indices. - >>> idx = pd.Index(['1', '2', '3']) - >>> idx.get_values() - array(['1', '2', '3'], dtype=object) + For internal compatibility with numpy arrays. 
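The `astype` behaviour described in the shared docstring above, as a small hedged sketch (values illustrative).

    import pandas as pd

    idx = pd.Index([1, 2, 3])
    print(idx.astype('float64'))      # Float64Index([1.0, 2.0, 3.0], dtype='float64')
    print(idx.astype('category'))     # dispatches to CategoricalIndex

    # Impossible conversions are reported as a TypeError.
    try:
        pd.Index(['a', 'b']).astype('int64')
    except TypeError as err:
        print(err)                    # Cannot cast Index to dtype int64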
- `MultiIndex` arrays also have only one dimension: + Parameters + ---------- + indices : list + Indices to be taken + axis : int, optional + The axis over which to select values, always 0. + allow_fill : bool, default True + fill_value : bool, default None + If allow_fill=True and fill_value is not None, indices specified by + -1 is regarded as NA. If Index doesn't hold NA, raise ValueError - >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], - ... names=('number', 'letter')) - >>> midx.get_values() - array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) - >>> midx.get_values().ndim - 1 + See Also + -------- + numpy.ndarray.take """ - return self.values - @Appender(IndexOpsMixin.memory_usage.__doc__) - def memory_usage(self, deep=False): - result = super(Index, self).memory_usage(deep=deep) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + if kwargs: + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + if self._can_hold_na: + taken = self._assert_take_fillable(self.values, indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value) + else: + if allow_fill and fill_value is not None: + msg = 'Unable to fill values because {0} cannot contain NA' + raise ValueError(msg.format(self.__class__.__name__)) + taken = self.values.take(indices) + return self._shallow_copy(taken) - # include our engine hashtable - result += self._engine.sizeof(deep=deep) - return result + def _assert_take_fillable(self, values, indices, allow_fill=True, + fill_value=None, na_value=np.nan): + """ + Internal method to handle NA filling of take. + """ + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + raise ValueError(msg) + taken = algos.take(values, + indices, + allow_fill=allow_fill, + fill_value=na_value) + else: + taken = values.take(indices) + return taken - # ops compat def repeat(self, repeats, *args, **kwargs): """ Repeat elements of an Index. @@ -794,8 +845,8 @@ def repeat(self, repeats, *args, **kwargs): See Also -------- - Series.repeat : Equivalent function for Series - numpy.repeat : Underlying implementation + Series.repeat : Equivalent function for Series. + numpy.repeat : Underlying implementation. Examples -------- @@ -810,155 +861,8 @@ def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(repeats)) - _index_shared_docs['where'] = """ - .. versionadded:: 0.19.0 - - Return an Index of same shape as self and whose corresponding - entries are from self where cond is True and otherwise are from - other. - - Parameters - ---------- - cond : boolean array-like with the same length as self - other : scalar, or array-like - """ - - @Appender(_index_shared_docs['where']) - def where(self, cond, other=None): - if other is None: - other = self._na_value - - dtype = self.dtype - values = self.values - - if is_bool(other) or is_bool_dtype(other): - - # bools force casting - values = values.astype(object) - dtype = None - - values = np.where(cond, values, other) - - if self._is_numeric_dtype and np.any(isna(values)): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. 
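A short sketch of the `take`/fill semantics documented above: with the default `fill_value=None`, `-1` keeps its usual positional meaning, while passing a fill value turns `-1` into a missing-value marker (values illustrative).

    import numpy as np
    import pandas as pd

    idx = pd.Index([10.0, 20.0, 30.0])

    # No fill_value: -1 is ordinary "last element" indexing.
    print(idx.take([2, 0, -1]))                     # [30.0, 10.0, 30.0]

    # With a fill_value, -1 marks positions to be filled with NA.
    print(idx.take([2, 0, -1], fill_value=np.nan))  # [30.0, 10.0, nan]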
- dtype = None - - return self._shallow_copy_with_infer(values, dtype=dtype) - - def ravel(self, order='C'): - """ - return an ndarray of the flattened values of the underlying data - - See also - -------- - numpy.ndarray.ravel - """ - return self._ndarray_values.ravel(order=order) - - # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desrired - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype('u8', copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - - @classmethod - def _scalar_data_error(cls, data): - raise TypeError('{0}(...) must be called with a collection of some ' - 'kind, {1} was passed'.format(cls.__name__, - repr(data))) - - @classmethod - def _string_data_error(cls, data): - raise TypeError('String dtype not supported, you may need ' - 'to explicitly cast to a numeric type') - - @classmethod - def _coerce_to_ndarray(cls, data): - """coerces data to ndarray, raises on scalar data. Converts other - iterables to list first and then to array. Does not touch ndarrays. - """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - - def _get_attributes_dict(self): - """ return an attributes dict for my class """ - return {k: getattr(self, k, None) for k in self._attributes} - - def view(self, cls=None): - - # we need to see if we are subclassing an - # index type here - if cls is not None and not hasattr(cls, '_typ'): - result = self._data.view(cls) - else: - result = self._shallow_copy() - if isinstance(result, Index): - result._id = self._id - return result - - def _coerce_scalar_to_index(self, item): - """ - we need to coerce a scalar to a compat for our index type - - Parameters - ---------- - item : scalar item to coerce - """ - dtype = self.dtype - - if self._is_numeric_dtype and isna(item): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return Index([item], dtype=dtype, **self._get_attributes_dict()) + # -------------------------------------------------------------------- + # Copying Methods _index_shared_docs['copy'] = """ Make a copy of this object. 
Name and dtype sets those attributes on @@ -1009,24 +913,8 @@ def __deepcopy__(self, memo=None): memo = {} return self.copy(deep=True) - def _validate_names(self, name=None, names=None, deep=False): - """ - Handles the quirks of having a singular 'name' parameter for general - Index and plural 'names' parameter for MultiIndex. - """ - from copy import deepcopy - if names is not None and name is not None: - raise TypeError("Can only provide one of `names` and `name`") - elif names is None and name is None: - return deepcopy(self.names) if deep else self.names - elif names is not None: - if not is_list_like(names): - raise TypeError("Must pass list-like as `names`.") - return names - else: - if not is_list_like(name): - return [name] - return name + # -------------------------------------------------------------------- + # Rendering Methods def __unicode__(self): """ @@ -1064,13 +952,13 @@ def _format_space(self): @property def _formatter_func(self): """ - Return the formatter function + Return the formatter function. """ return default_pprint def _format_data(self, name=None): """ - Return the formatted data as a unicode string + Return the formatted data as a unicode string. """ # do we want to justify (only do so for non-objects) @@ -1083,170 +971,267 @@ def _format_data(self, name=None): def _format_attrs(self): """ - Return a list of tuples of the (attr,formatted_value) + Return a list of tuples of the (attr,formatted_value). """ return format_object_attrs(self) - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index - - Parameters - ---------- - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values - Returns - ------- - Series : dtype will be based on the type of the Index values. + def format(self, name=False, formatter=None, **kwargs): """ - - from pandas import Series - - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name - - return Series(self.values.copy(), index=index, name=name) - - def to_frame(self, index=True, name=None): + Render a string representation of the Index. """ - Create a DataFrame with a column containing the Index. + header = [] + if name: + header.append(pprint_thing(self.name, + escape_chars=('\t', '\r', '\n')) if + self.name is not None else '') - .. versionadded:: 0.24.0 + if formatter is not None: + return header + list(self.map(formatter)) - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index. + return self._format_with_header(header, **kwargs) - name : object, default None - The passed name should substitute for the index name (if it has - one). + def _format_with_header(self, header, na_rep='NaN', **kwargs): + values = self.values - Returns - ------- - DataFrame - DataFrame containing the original Index data. + from pandas.io.formats.format import format_array - See Also - -------- - Index.to_series : Convert an Index to a Series. - Series.to_frame : Convert Series to DataFrame. 
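For the rendering helpers grouped in this block, a minimal sketch of the `format` entry point (index contents and the `na_rep` string are illustrative).

    import pandas as pd

    idx = pd.Index(['a', None, 'c'], name='letters')
    print(idx.format())                    # ['a', 'NaN', 'c']
    print(idx.format(name=True))           # ['letters', 'a', 'NaN', 'c']
    print(idx.format(na_rep='<missing>'))  # ['a', '<missing>', 'c']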
+ if is_categorical_dtype(values.dtype): + values = np.array(values) - Examples - -------- - >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') - >>> idx.to_frame() - animal - animal - Ant Ant - Bear Bear - Cow Cow + elif is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, safe=1) - By default, the original Index is reused. To enforce a new Index: + if is_object_dtype(values.dtype): + result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) + for x in values] - >>> idx.to_frame(index=False) - animal - 0 Ant - 1 Bear - 2 Cow + # could have nans + mask = isna(values) + if mask.any(): + result = np.array(result) + result[mask] = na_rep + result = result.tolist() - To override the name of the resulting column, specify `name`: + else: + result = _trim_front(format_array(values, None, justify='left')) + return header + result - >>> idx.to_frame(index=False, name='zoo') - zoo - 0 Ant - 1 Bear - 2 Cow + def to_native_types(self, slicer=None, **kwargs): """ + Format specified values of `self` and return them. - from pandas import DataFrame - if name is None: - name = self.name or 0 - result = DataFrame({name: self.values.copy()}) + Parameters + ---------- + slicer : int, array-like + An indexer into `self` that specifies which values + are used in the formatting process. + kwargs : dict + Options for specifying how the values should be formatted. + These options include the following: - if index: - result.index = self - return result + 1) na_rep : str + The value that serves as a placeholder for NULL values + 2) quoting : bool or None + Whether or not there are quoted values in `self` + 3) date_format : str + The format used to represent date-like values + """ - _index_shared_docs['astype'] = """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. + values = self + if slicer is not None: + values = values[slicer] + return values._format_native_types(**kwargs) + + def _format_native_types(self, na_rep='', quoting=None, **kwargs): + """ + Actually format specific types of the index. + """ + mask = isna(self) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) + + values[mask] = na_rep + return values + + def _summary(self, name=None): + """ + Return a summarized representation. Parameters ---------- - dtype : numpy dtype or pandas type - copy : bool, default True - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. + name : str + name to use in the summary representation - .. versionadded:: 0.19.0 + Returns + ------- + String with a summarized representation of the index + """ + if len(self) > 0: + head = self[0] + if (hasattr(head, 'format') and + not isinstance(head, compat.string_types)): + head = head.format() + tail = self[-1] + if (hasattr(tail, 'format') and + not isinstance(tail, compat.string_types)): + tail = tail.format() + index_summary = ', %s to %s' % (pprint_thing(head), + pprint_thing(tail)) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + return '%s: %s entries%s' % (name, len(self), index_summary) + def summary(self, name=None): """ + Return a summarized representation. 
- @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype): - return self.copy() if copy else self + .. deprecated:: 0.23.0 + """ + warnings.warn("'summary' is deprecated and will be removed in a " + "future version.", FutureWarning, stacklevel=2) + return self._summary(name) - elif is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, - copy=copy) + # -------------------------------------------------------------------- + # Conversion Methods - elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), dtype=dtype, copy=copy) + def to_flat_index(self): + """ + Identity method. - try: - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - return DatetimeIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - return Index(self.values.astype(dtype, copy=copy), name=self.name, - dtype=dtype) - except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + .. versionadded:: 0.24.0 - def _to_safe_for_reshape(self): - """ convert to object if we are a categorical """ + This is implemented for compatability with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ return self - def _assert_can_do_setop(self, other): - if not is_list_like(other): - raise TypeError('Input must be Index or array-like') - return True + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index. - def _convert_can_do_setop(self, other): - if not isinstance(other, Index): - other = Index(other, name=self.name) - result_name = self.name - else: - result_name = self.name if self.name == other.name else None - return other, result_name + Parameters + ---------- + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index - def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ - return value + Returns + ------- + Series : dtype will be based on the type of the Index values. + """ - def _assert_can_do_op(self, value): - """ Check value is valid for scalar op """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) + from pandas import Series - @property - def nlevels(self): - return 1 + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + return Series(self.values.copy(), index=index, name=name) + + def to_frame(self, index=True, name=None): + """ + Create a DataFrame with a column containing the Index. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index. + + name : object, default None + The passed name should substitute for the index name (if it has + one). + + Returns + ------- + DataFrame + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. 
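A small sketch of the conversion methods gathered here (`to_flat_index`, `to_series`); names are illustrative.

    import pandas as pd

    idx = pd.Index(['a', 'b', 'c'], name='letters')

    # to_flat_index is an identity on a regular Index; MultiIndex overrides it.
    print(idx.to_flat_index() is idx)             # True

    # to_series uses the index both as the index and as the values.
    s = idx.to_series()
    print(s.index.equals(idx), list(s.values))    # True ['a', 'b', 'c']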
+ + Examples + -------- + >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + + from pandas import DataFrame + if name is None: + name = self.name or 0 + result = DataFrame({name: self.values.copy()}) + + if index: + result.index = self + return result + + # -------------------------------------------------------------------- + # Name-Centric Methods + + def _validate_names(self, name=None, names=None, deep=False): + """ + Handles the quirks of having a singular 'name' parameter for general + Index and plural 'names' parameter for MultiIndex. + """ + from copy import deepcopy + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + elif names is None and name is None: + return deepcopy(self.names) if deep else self.names + elif names is not None: + if not is_list_like(names): + raise TypeError("Must pass list-like as `names`.") + return names + else: + if not is_list_like(name): + return [name] + return name def _get_names(self): return FrozenList((self.name, )) @@ -1405,68 +1390,204 @@ def rename(self, name, inplace=False): """ return self.set_names([name], inplace=inplace) + # -------------------------------------------------------------------- + # Level-Centric Methods + @property - def _has_complex_internals(self): - # to disable groupby tricks in MultiIndex - return False + def nlevels(self): + return 1 - def _summary(self, name=None): + def _sort_levels_monotonic(self): + """ + Compat with MultiIndex. """ - Return a summarized representation + return self - Parameters - ---------- - name : str - name to use in the summary representation + def _validate_index_level(self, level): + """ + Validate index level. + + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. - Returns - ------- - String with a summarized representation of the index """ - if len(self) > 0: - head = self[0] - if (hasattr(head, 'format') and - not isinstance(head, compat.string_types)): - head = head.format() - tail = self[-1] - if (hasattr(tail, 'format') and - not isinstance(tail, compat.string_types)): - tail = tail.format() - index_summary = ', %s to %s' % (pprint_thing(head), - pprint_thing(tail)) - else: - index_summary = '' + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError("Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level, )) + elif level > 0: + raise IndexError("Too many levels:" + " Index has only 1 level, not %d" % + (level + 1)) + elif level != self.name: + raise KeyError('Level %s must be same as name (%s)' % + (level, self.name)) - if name is None: - name = type(self).__name__ - return '%s: %s entries%s' % (name, len(self), index_summary) + def _get_level_number(self, level): + self._validate_index_level(level) + return 0 - def summary(self, name=None): + def sortlevel(self, level=None, ascending=True, sort_remaining=None): """ - Return a summarized representation - .. deprecated:: 0.23.0 + For internal compatibility with with the Index API. + + Sort the Index. 
This is for compat with MultiIndex + + Parameters + ---------- + ascending : boolean, default True + False to sort in descending order + + level, sort_remaining are compat parameters + + Returns + ------- + sorted_index : Index """ - warnings.warn("'summary' is deprecated and will be removed in a " - "future version.", FutureWarning, stacklevel=2) - return self._summary(name) + return self.sort_values(return_indexer=True, ascending=ascending) - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.values + def _get_level_values(self, level): + """ + Return an Index of values for requested level. - _na_value = np.nan - """The expected NA value to use with this index.""" + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatability. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + values : Index + Calling object, as there is only one level in the Index. + + See Also + -------- + MultiIndex.get_level_values : Get values for a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self._validate_index_level(level) + return self + + get_level_values = _get_level_values + + def droplevel(self, level=0): + """ + Return index with requested level(s) removed. + + If resulting index has only 1 level left, the result will be + of Index type, not MultiIndex. + + .. versionadded:: 0.23.1 (support for non-MultiIndex) + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. 
+ + Returns + ------- + index : Index or MultiIndex + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + if len(level) == 0: + return self + if len(level) >= self.nlevels: + raise ValueError("Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels)) + # The two checks above guarantee that here self is a MultiIndex + + new_levels = list(self.levels) + new_codes = list(self.codes) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_codes.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_codes[0] == -1 + result = new_levels[0].take(new_codes[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result.name = new_names[0] + return result + else: + from .multi import MultiIndex + return MultiIndex(levels=new_levels, codes=new_codes, + names=new_names, verify_integrity=False) + + _index_shared_docs['_get_grouper_for_level'] = """ + Get index grouper corresponding to an index level + + Parameters + ---------- + mapper: Group mapping function or None + Function mapping index values to groups + level : int or None + Index level + + Returns + ------- + grouper : Index + Index of values to group on + labels : ndarray of int or None + Array of locations in level_index + uniques : Index or None + Index of unique values for level + """ + + @Appender(_index_shared_docs['_get_grouper_for_level']) + def _get_grouper_for_level(self, mapper, level=None): + assert level is None or level == 0 + if mapper is None: + grouper = self + else: + grouper = self.map(mapper) + + return grouper, None, None + + # -------------------------------------------------------------------- + # Introspection Methods - # introspection @property def is_monotonic(self): - """ alias for is_monotonic_increasing (deprecated) """ + """ + Alias for is_monotonic_increasing. + """ return self.is_monotonic_increasing @property def is_monotonic_increasing(self): """ - return if the index is monotonic increasing (only equal or + Return if the index is monotonic increasing (only equal or increasing) values. Examples @@ -1483,7 +1604,7 @@ def is_monotonic_increasing(self): @property def is_monotonic_decreasing(self): """ - return if the index is monotonic decreasing (only equal or + Return if the index is monotonic decreasing (only equal or decreasing) values. Examples @@ -1499,8 +1620,9 @@ def is_monotonic_decreasing(self): @property def _is_strictly_monotonic_increasing(self): - """return if the index is strictly monotonic increasing - (only increasing) values + """ + Return if the index is strictly monotonic increasing + (only increasing) values. Examples -------- @@ -1515,8 +1637,9 @@ def _is_strictly_monotonic_increasing(self): @property def _is_strictly_monotonic_decreasing(self): - """return if the index is strictly monotonic decreasing - (only decreasing) values + """ + Return if the index is strictly monotonic decreasing + (only decreasing) values. Examples -------- @@ -1534,7 +1657,9 @@ def is_lexsorted_for_tuple(self, tup): @cache_readonly def is_unique(self): - """ return if the index has unique values """ + """ + Return if the index has unique values. 
+ """ return self._engine.is_unique @property @@ -1601,232 +1726,385 @@ def is_mixed(self): def holds_integer(self): return self.inferred_type in ['integer', 'mixed-integer'] - _index_shared_docs['_convert_scalar_indexer'] = """ - Convert a scalar indexer. + @cache_readonly + def inferred_type(self): + """ + Return a string of the type inferred from the values. + """ + return lib.infer_dtype(self) - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ + @cache_readonly + def is_all_dates(self): + if self._data is None: + return False + return is_datetime_array(ensure_object(self.values)) - @Appender(_index_shared_docs['_convert_scalar_indexer']) - def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + # -------------------------------------------------------------------- + # Pickle Methods - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + def __reduce__(self): + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (self.__class__, d), None - if len(self) and not isinstance(self, ABCMultiIndex,): + def __setstate__(self, state): + """ + Necessary for making this object picklable. + """ - # we can raise here if we are definitive that this - # is positional indexing (eg. .ix on with a float) - # or label indexing if we are using a type able - # to be represented in the index + if isinstance(state, dict): + self._data = state.pop('data') + for k, v in compat.iteritems(state): + setattr(self, k, v) - if kind in ['getitem', 'ix'] and is_float(key): - if not self.is_floating(): - return self._invalid_indexer('label', key) + elif isinstance(state, tuple): - elif kind in ['loc'] and is_float(key): + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + self.name = own_state[0] - # we want to raise KeyError on string/mixed here - # technically we *could* raise a TypeError - # on anything but mixed though - if self.inferred_type not in ['floating', - 'mixed-integer-float', - 'string', - 'unicode', - 'mixed']: - return self._invalid_indexer('label', key) + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) - elif kind in ['loc'] and is_integer(key): - if not self.holds_integer(): - return self._invalid_indexer('label', key) + self._data = data + self._reset_identity() + else: + raise Exception("invalid pickle state") - return key + _unpickle_compat = __setstate__ - _index_shared_docs['_convert_slice_indexer'] = """ - Convert a slice indexer. + # -------------------------------------------------------------------- + # Null Handling Methods - By definition, these are labels unless 'iloc' is passed in. - Floats are not allowed as the start, step, or stop of the slice. + _na_value = np.nan + """The expected NA value to use with this index.""" - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ + @cache_readonly + def _isnan(self): + """ + Return if each value is NaN. 
+ """ + if self._can_hold_na: + return isna(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values - @Appender(_index_shared_docs['_convert_slice_indexer']) - def _convert_slice_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + @cache_readonly + def _nan_idxs(self): + if self._can_hold_na: + w, = self._isnan.nonzero() + return w + else: + return np.array([], dtype=np.int64) - # if we are not a slice, then we are done - if not isinstance(key, slice): - return key + @cache_readonly + def hasnans(self): + """ + Return if I have any nans; enables various perf speedups. + """ + if self._can_hold_na: + return bool(self._isnan.any()) + else: + return False - # validate iloc - if kind == 'iloc': - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + def isna(self): + """ + Detect missing values. - # potentially cast the bounds to integers - start, stop, step = key.start, key.stop, key.step + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. Characters such as + empty strings `''` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). - # figure out if this is a positional indexer - def is_int(v): - return v is None or is_integer(v) + .. versionadded:: 0.20.0 - is_null_slicer = start is None and stop is None - is_index_slice = is_int(start) and is_int(stop) - is_positional = is_index_slice and not self.is_integer() + Returns + ------- + numpy.ndarray + A boolean array of whether my values are NA - if kind == 'getitem': - """ - called from the getitem slicers, validate that we are in fact - integers - """ - if self.is_integer() or is_index_slice: - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + See Also + -------- + pandas.Index.notna : Boolean inverse of isna. + pandas.Index.dropna : Omit entries with missing values. + pandas.isna : Top-level isna. + Series.isna : Detect missing values in Series object. - # convert the slice to an indexer here + Examples + -------- + Show which entries in a pandas.Index are NA. The result is an + array. - # if we are mixed and have integers - try: - if is_positional and self.is_mixed(): - # Validate start & stop - if start is not None: - self.get_loc(start) - if stop is not None: - self.get_loc(stop) - is_positional = False - except KeyError: - if self.inferred_type == 'mixed-integer-float': - raise + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.isna() + array([False, False, True], dtype=bool) - if is_null_slicer: - indexer = key - elif is_positional: - indexer = key - else: - try: - indexer = self.slice_indexer(start, stop, step, kind=kind) - except Exception: - if is_index_slice: - if self.is_integer(): - raise - else: - indexer = key - else: - raise + Empty strings are not considered NA values. None is considered an NA + value. 
- return indexer + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.isna() + array([False, False, False, True], dtype=bool) - def _convert_listlike_indexer(self, keyarr, kind=None): + For datetimes, `NaT` (Not a Time) is considered as an NA value. + + >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), + ... pd.Timestamp(''), None, pd.NaT]) + >>> idx + DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]', freq=None) + >>> idx.isna() + array([False, True, True, True], dtype=bool) """ - Parameters - ---------- - keyarr : list-like - Indexer to convert. + return self._isnan + isnull = isna + + def notna(self): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. + + .. versionadded:: 0.20.0 Returns ------- - tuple (indexer, keyarr) - indexer is an ndarray or None if cannot convert - keyarr are tuple-safe keys - """ - if isinstance(keyarr, Index): - keyarr = self._convert_index_indexer(keyarr) - else: - keyarr = self._convert_arr_indexer(keyarr) + numpy.ndarray + Boolean array to indicate which entries are not NA. - indexer = self._convert_list_indexer(keyarr, kind=kind) - return indexer, keyarr + See Also + -------- + Index.notnull : Alias of notna. + Index.isna: Inverse of notna. + pandas.notna : Top-level notna. - _index_shared_docs['_convert_arr_indexer'] = """ - Convert an array-like indexer to the appropriate dtype. + Examples + -------- + Show which entries in an Index are not NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.notna() + array([ True, True, False]) + + Empty strings are not considered NA values. None is considered a NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.notna() + array([ True, True, True, False]) + """ + return ~self.isna() + notnull = notna + + _index_shared_docs['fillna'] = """ + Fill NA/NaN values with the specified value Parameters ---------- - keyarr : array-like - Indexer to convert. + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible) Returns ------- - converted_keyarr : array-like - """ + filled : %(klass)s + """ - @Appender(_index_shared_docs['_convert_arr_indexer']) - def _convert_arr_indexer(self, keyarr): - keyarr = com.asarray_tuplesafe(keyarr) - return keyarr + @Appender(_index_shared_docs['fillna']) + def fillna(self, value=None, downcast=None): + self._assert_can_do_op(value) + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if + return Index(result, name=self.name) + return self._shallow_copy() - _index_shared_docs['_convert_index_indexer'] = """ - Convert an Index indexer to the appropriate dtype. 
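A hedged sketch of the NA-handling entry points collected in this block (`hasnans`, `fillna`); values illustrative.

    import numpy as np
    import pandas as pd

    idx = pd.Index([1.0, np.nan, 3.0])
    print(idx.hasnans)               # True
    print(idx.fillna(0.0))           # Float64Index([1.0, 0.0, 3.0], dtype='float64')
    print(idx.fillna(0.0).hasnans)   # False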
+ _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values Parameters ---------- - keyarr : Index (or sub-class) - Indexer to convert. + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. Returns ------- - converted_keyarr : Index (or sub-class) - """ + valid : Index + """ - @Appender(_index_shared_docs['_convert_index_indexer']) - def _convert_index_indexer(self, keyarr): - return keyarr + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) - _index_shared_docs['_convert_list_indexer'] = """ - Convert a list-like indexer to the appropriate dtype. + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() + + # -------------------------------------------------------------------- + # Uniqueness Methods + + _index_shared_docs['index_unique'] = ( + """ + Return unique values in the index. Uniques are returned in order + of appearance, this does NOT sort. Parameters ---------- - keyarr : Index (or sub-class) - Indexer to convert. - kind : iloc, ix, loc, optional + level : int or str, optional, default None + Only return values from specified level (for MultiIndex) + + .. versionadded:: 0.23.0 Returns ------- - positional indexer or None - """ + Index without duplicates - @Appender(_index_shared_docs['_convert_list_indexer']) - def _convert_list_indexer(self, keyarr, kind=None): - if (kind in [None, 'iloc', 'ix'] and - is_integer_dtype(keyarr) and not self.is_floating() and - not isinstance(keyarr, ABCPeriodIndex)): + See Also + -------- + unique + Series.unique + """) - if self.inferred_type == 'mixed-integer': - indexer = self.get_indexer(keyarr) - if (indexer >= 0).all(): - return indexer - # missing values are flagged as -1 by get_indexer and negative - # indices are already converted to positive indices in the - # above if-statement, so the negative flags are changed to - # values outside the range of indices so as to trigger an - # IndexError in maybe_convert_indices - indexer[indexer < 0] = len(self) - from pandas.core.indexing import maybe_convert_indices - return maybe_convert_indices(indexer, len(self)) + @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + result = super(Index, self).unique() + return self._shallow_copy(result) - elif not self.inferred_type == 'integer': - keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) - return keyarr + def drop_duplicates(self, keep='first'): + """ + Return Index with duplicate values removed. - return None + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. - def _invalid_indexer(self, form, key): - """ consistent invalid indexer message """ - raise TypeError("cannot do {form} indexing on {klass} with these " - "indexers [{key}] of {kind}".format( - form=form, klass=type(self), key=key, - kind=type(key))) + Returns + ------- + deduplicated : Index + + See Also + -------- + Series.drop_duplicates : Equivalent method on Series. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Index.duplicated : Related method on Index, indicating duplicate + Index values. 
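`dropna` and `unique`, as documented above, in a minimal sketch; note that `unique` keeps order of appearance and does not sort (values illustrative).

    import numpy as np
    import pandas as pd

    idx = pd.Index([3.0, 3.0, np.nan, 1.0])
    print(idx.dropna())   # Float64Index([3.0, 3.0, 1.0], dtype='float64')
    print(idx.unique())   # order of appearance, NaN retained: [3.0, nan, 1.0]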
+ + Examples + -------- + Generate an pandas.Index with duplicate values. + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + + The `keep` parameter controls which duplicate values are removed. + The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> idx.drop_duplicates(keep='first') + Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + + The value 'last' keeps the last occurrence for each set of duplicated + entries. + + >>> idx.drop_duplicates(keep='last') + Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + + The value ``False`` discards all sets of duplicated entries. + + >>> idx.drop_duplicates(keep=False) + Index(['cow', 'beetle', 'hippo'], dtype='object') + """ + return super(Index, self).drop_duplicates(keep=keep) + + def duplicated(self, keep='first'): + """ + Indicate duplicate index values. + + Duplicated values are indicated as ``True`` values in the resulting + array. Either all duplicates, all except the first, or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + The value or values in a set of duplicates to mark as missing. + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set to False and all others to True: + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx.duplicated() + array([False, False, True, False, True]) + + which is equivalent to + + >>> idx.duplicated(keep='first') + array([False, False, True, False, True]) + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> idx.duplicated(keep='last') + array([ True, False, True, False, False]) + + By setting keep on ``False``, all duplicates are True: + + >>> idx.duplicated(keep=False) + array([ True, False, True, False, True]) + + Returns + ------- + numpy.ndarray + + See Also + -------- + pandas.Series.duplicated : Equivalent method on pandas.Series. + pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame. + pandas.Index.drop_duplicates : Remove duplicate values from Index. + """ + return super(Index, self).duplicated(keep=keep) def get_duplicates(self): """ @@ -1853,12 +2131,8 @@ def get_duplicates(self): Works on different Index of types. - >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates() + >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates() # doctest: +SKIP [2, 3] - >>> pd.Index([1., 2., 2., 3., 3., 3., 4.]).get_duplicates() - [2.0, 3.0] - >>> pd.Index(['a', 'b', 'b', 'c', 'c', 'c', 'd']).get_duplicates() - ['b', 'c'] Note that for a DatetimeIndex, it does not return a list but a new DatetimeIndex: @@ -1866,22 +2140,22 @@ def get_duplicates(self): >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03', ... '2018-01-03', '2018-01-04', '2018-01-04'], ... format='%Y-%m-%d') - >>> pd.Index(dates).get_duplicates() + >>> pd.Index(dates).get_duplicates() # doctest: +SKIP DatetimeIndex(['2018-01-03', '2018-01-04'], dtype='datetime64[ns]', freq=None) Sorts duplicated elements even when indexes are unordered. 
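Since `get_duplicates` is deprecated here, the equivalent spelled out with the surviving API, which is exactly what the method body below falls back to (values illustrative).

    import pandas as pd

    idx = pd.Index([1, 2, 2, 3, 3, 3])
    print(idx[idx.duplicated()].unique())   # Int64Index([2, 3], dtype='int64')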
- >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates() + >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates() # doctest: +SKIP [2, 3] Return empty array-like structure when all elements are unique. - >>> pd.Index([1, 2, 3, 4]).get_duplicates() + >>> pd.Index([1, 2, 3, 4]).get_duplicates() # doctest: +SKIP [] >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'], ... format='%Y-%m-%d') - >>> pd.Index(dates).get_duplicates() + >>> pd.Index(dates).get_duplicates() # doctest: +SKIP DatetimeIndex([], dtype='datetime64[ns]', freq=None) """ warnings.warn("'get_duplicates' is deprecated and will be removed in " @@ -1891,91 +2165,65 @@ def get_duplicates(self): return self[self.duplicated()].unique() - def _cleanup(self): - self._engine.clear_mapping() - - @cache_readonly - def _constructor(self): - return type(self) - - @cache_readonly - def _engine(self): - # property, for now, slow to look up - return self._engine_type(lambda: self._ndarray_values, len(self)) - - def _validate_index_level(self, level): + def _get_unique_index(self, dropna=False): """ - Validate index level. + Returns an index containing unique values. - For single-level Index getting level number is a no-op, but some - verification must be done like in MultiIndex. + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. + Returns + ------- + uniques : index """ - if isinstance(level, int): - if level < 0 and level != -1: - raise IndexError("Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level, )) - elif level > 0: - raise IndexError("Too many levels:" - " Index has only 1 level, not %d" % - (level + 1)) - elif level != self.name: - raise KeyError('Level %s must be same as name (%s)' % - (level, self.name)) + if self.is_unique and not dropna: + return self - def _get_level_number(self, level): - self._validate_index_level(level) - return 0 + values = self.values - @cache_readonly - def inferred_type(self): - """ return a string of the type inferred from the values """ - return lib.infer_dtype(self) + if not self.is_unique: + values = self.unique() - def _is_memory_usage_qualified(self): - """ return a boolean if we need a qualified .info display """ - return self.is_object() + if dropna: + try: + if self.hasnans: + values = values[~isna(values)] + except NotImplementedError: + pass - def is_type_compatible(self, kind): - return kind == self.inferred_type + return self._shallow_copy(values) - @cache_readonly - def is_all_dates(self): - if self._data is None: - return False - return is_datetime_array(ensure_object(self.values)) + # -------------------------------------------------------------------- + # Arithmetic & Logical Methods - def __reduce__(self): - d = dict(data=self._data) - d.update(self._get_attributes_dict()) - return _new_Index, (self.__class__, d), None + def __add__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented + return Index(np.array(self) + other) - def __setstate__(self, state): - """Necessary for making this object picklable""" + def __radd__(self, other): + return Index(other + np.array(self)) - if isinstance(state, dict): - self._data = state.pop('data') - for k, v in compat.iteritems(state): - setattr(self, k, v) + def __iadd__(self, other): + # alias for __add__ + return self + other - elif isinstance(state, tuple): + def __sub__(self, other): + return Index(np.array(self) - other) - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - 
np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] + def __rsub__(self, other): + return Index(other - np.array(self)) - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) + def __and__(self, other): + return self.intersection(other) - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") + def __or__(self, other): + return self.union(other) - _unpickle_compat = __setstate__ + def __xor__(self, other): + return self.symmetric_difference(other) def __nonzero__(self): raise ValueError("The truth value of a {0} is ambiguous. " @@ -1984,2175 +2232,2319 @@ def __nonzero__(self): __bool__ = __nonzero__ - _index_shared_docs['__contains__'] = """ - return a boolean if this key is IN the index + # -------------------------------------------------------------------- + # Set Operation Methods + + def _get_reconciled_name_object(self, other): + """ + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. + """ + name = get_op_result_name(self, other) + if self.name != name: + return self._shallow_copy(name=name) + return self + + def union(self, other): + """ + Form the union of two Index objects and sorts if possible. Parameters ---------- - key : object + other : Index or array-like Returns ------- - boolean - """ - - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) - def __contains__(self, key): - hash(key) - try: - return key in self._engine - except (OverflowError, TypeError, ValueError): - return False + union : Index - _index_shared_docs['contains'] = """ - return a boolean if this key is IN the index - - Parameters - ---------- - key : object + Examples + -------- - Returns - ------- - boolean + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') """ + self._assert_can_do_setop(other) + other = ensure_index(other) - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) - def contains(self, key): - hash(key) - try: - return key in self._engine - except (TypeError, ValueError): - return False + if len(other) == 0 or self.equals(other): + return self._get_reconciled_name_object(other) - def __hash__(self): - raise TypeError("unhashable type: %r" % type(self).__name__) + if len(self) == 0: + return other._get_reconciled_name_object(self) - def __setitem__(self, key, value): - raise TypeError("Index does not support mutable operations") + # TODO: is_dtype_union_equal is a hack around + # 1. buggy set ops with duplicates (GH #13432) + # 2. CategoricalIndex lacking setops (GH #10186) + # Once those are fixed, this workaround can be removed + if not is_dtype_union_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.union(other) - def __getitem__(self, key): - """ - Override numpy.ndarray's __getitem__ method to work as desired. + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self) or is_datetime64tz_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other) or is_datetime64tz_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values - This function adds lists and Series as valid boolean indexers - (ndarrays only supports ndarray with dtype=bool). 
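The arithmetic and logical dunders collected above behave element-wise for `+`/`-` and as set operations for `&`/`|`/`^`; a brief sketch with illustrative values.

    import pandas as pd

    idx = pd.Index([1, 2, 3])
    other = pd.Index([2, 3, 4])

    print(idx + 1)        # element-wise: Int64Index([2, 3, 4], dtype='int64')
    print(idx & other)    # intersection: Int64Index([2, 3], dtype='int64')
    print(idx | other)    # union: Int64Index([1, 2, 3, 4], dtype='int64')
    print(idx ^ other)    # symmetric difference: Int64Index([1, 4], dtype='int64')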
+ if self.is_monotonic and other.is_monotonic: + try: + result = self._outer_indexer(lvals, rvals)[0] + except TypeError: + # incomparable objects + result = list(lvals) - If resulting ndim != 1, plain ndarray is returned instead of - corresponding `Index` subclass. + # worth making this faster? a very unusual case + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) + else: + indexer = self.get_indexer(other) + indexer, = (indexer == -1).nonzero() - """ - # There's no custom logic to be implemented in __getslice__, so it's - # not overloaded intentionally. - getitem = self._data.__getitem__ - promote = self._shallow_copy + if len(indexer) > 0: + other_diff = algos.take_nd(rvals, indexer, + allow_fill=False) + result = _concat._concat_compat((lvals, other_diff)) - if is_scalar(key): - key = com.cast_scalar_indexer(key) - return getitem(key) + try: + lvals[0] < other_diff[0] + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) + else: + types = frozenset((self.inferred_type, + other.inferred_type)) + if not types & _unsortable_types: + result.sort() - if isinstance(key, slice): - # This case is separated from the conditional above to avoid - # pessimization of basic indexing. - return promote(getitem(key)) + else: + result = lvals - if com.is_bool_indexer(key): - key = np.asarray(key) + try: + result = np.sort(result) + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) - key = com.values_from_object(key) - result = getitem(key) - if not is_scalar(result): - return promote(result) - else: - return result + # for subclasses + return self._wrap_setop_result(other, result) - def _can_hold_identifiers_and_holds_name(self, name): - """ - Faster check for ``name in self`` when we know `name` is a Python - identifier (e.g. in NDFrame.__getattr__, which hits this to support - . key lookup). For indexes that can't hold identifiers (everything - but object & categorical) we just return False. + def _wrap_setop_result(self, other, result): + return self._constructor(result, name=get_op_result_name(self, other)) - https://github.com/pandas-dev/pandas/issues/19764 + def intersection(self, other): """ - if self.is_object() or self.is_categorical(): - return name in self - return False + Form the intersection of two Index objects. - def append(self, other): - """ - Append a collection of Index options together + This returns a new Index with elements common to the index and `other`, + preserving the order of the calling index. 
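Two behaviours worth noting from the set-op code above, sketched with illustrative inputs: mixed dtypes are unioned after falling back to object, and `intersection` preserves the calling index's order.

    import pandas as pd

    # Different dtypes are cast to object before the union is taken.
    print(pd.Index([1, 2, 3]).union(pd.Index(['a', 'b'])).dtype)   # object

    # Intersection keeps the order of the calling index (no sort).
    print(pd.Index([3, 1, 2]).intersection(pd.Index([2, 3])))      # Int64Index([3, 2], dtype='int64')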
Parameters ---------- - other : Index or list/tuple of indices + other : Index or array-like Returns ------- - appended : Index - """ + intersection : Index - to_concat = [self] + Examples + -------- - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Int64Index([3, 4], dtype='int64') + """ + self._assert_can_do_setop(other) + other = ensure_index(other) - for obj in to_concat: - if not isinstance(obj, Index): - raise TypeError('all inputs must be Index') + if self.equals(other): + return self._get_reconciled_name_object(other) - names = {obj.name for obj in to_concat} - name = None if len(names) > 1 else self.name + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.intersection(other) - return self._concat(to_concat, name) + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values - def _concat(self, to_concat, name): + if self.is_monotonic and other.is_monotonic: + try: + result = self._inner_indexer(lvals, rvals)[0] + return self._wrap_setop_result(other, result) + except TypeError: + pass - typs = _concat.get_dtype_kinds(to_concat) + try: + indexer = Index(rvals).get_indexer(lvals) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except Exception: + # duplicates + indexer = algos.unique1d( + Index(rvals).get_indexer_non_unique(lvals)[0]) + indexer = indexer[indexer != -1] - if len(typs) == 1: - return self._concat_same_dtype(to_concat, name=name) - return _concat._concat_index_asobject(to_concat, name=name) + taken = other.take(indexer) + if self.name != other.name: + taken.name = None + return taken - def _concat_same_dtype(self, to_concat, name): + def difference(self, other, sort=True): """ - Concatenate to_concat which has the same class - """ - # must be overridden in specific classes - return _concat._concat_index_asobject(to_concat, name) - - _index_shared_docs['take'] = """ - return a new %(klass)s of the values selected by the indices + Return a new Index with elements from the index that are not in + `other`. - For internal compatibility with numpy arrays. + This is the set difference of two Index objects. Parameters ---------- - indices : list - Indices to be taken - axis : int, optional - The axis over which to select values, always 0. - allow_fill : bool, default True - fill_value : bool, default None - If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError + other : Index or array-like + sort : bool, default True + Sort the resulting index if possible + + .. 
versionadded:: 0.24.0 + + Returns + ------- + difference : Index - See also + Examples -------- - numpy.ndarray.take + + >>> idx1 = pd.Index([2, 1, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.difference(idx2) + Int64Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Int64Index([2, 1], dtype='int64') """ + self._assert_can_do_setop(other) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - if kwargs: - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - if self._can_hold_na: - taken = self._assert_take_fillable(self.values, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value) - else: - if allow_fill and fill_value is not None: - msg = 'Unable to fill values because {0} cannot contain NA' - raise ValueError(msg.format(self.__class__.__name__)) - taken = self.values.take(indices) - return self._shallow_copy(taken) + if self.equals(other): + # pass an empty np.ndarray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=np.nan): - """ Internal method to handle NA filling of take """ - indices = ensure_platform_int(indices) + other, result_name = self._convert_can_do_setop(other) - # only fill if we are passing a non-None fill_value - if allow_fill and fill_value is not None: - if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - raise ValueError(msg) - taken = algos.take(values, - indices, - allow_fill=allow_fill, - fill_value=na_value) - else: - taken = values.take(indices) - return taken + this = self._get_unique_index() - @cache_readonly - def _isnan(self): - """ return if each value is nan""" - if self._can_hold_na: - return isna(self) - else: - # shouldn't reach to this condition by checking hasnans beforehand - values = np.empty(len(self), dtype=np.bool_) - values.fill(False) - return values + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) - @cache_readonly - def _nan_idxs(self): - if self._can_hold_na: - w, = self._isnan.nonzero() - return w - else: - return np.array([], dtype=np.int64) + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + the_diff = this.values.take(label_diff) + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass - @cache_readonly - def hasnans(self): - """ return if I have any nans; enables various perf speedups """ - if self._can_hold_na: - return self._isnan.any() - else: - return False + return this._shallow_copy(the_diff, name=result_name, freq=None) - def isna(self): + def symmetric_difference(self, other, result_name=None): """ - Detect missing values. + Compute the symmetric difference of two Index objects. - Return a boolean same-sized object indicating if the values are NA. - NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get - mapped to ``True`` values. - Everything else get mapped to ``False`` values. Characters such as - empty strings `''` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). + It's sorted if sorting is possible. - .. 
versionadded:: 0.20.0 + Parameters + ---------- + other : Index or array-like + result_name : str Returns ------- - numpy.ndarray - A boolean array of whether my values are NA + symmetric_difference : Index - See Also - -------- - pandas.Index.notna : boolean inverse of isna. - pandas.Index.dropna : omit entries with missing values. - pandas.isna : top-level isna. - Series.isna : detect missing values in Series object. + Notes + ----- + ``symmetric_difference`` contains elements that appear in either + ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by + ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates + dropped. Examples -------- - Show which entries in a pandas.Index are NA. The result is an - array. + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([2, 3, 4, 5]) + >>> idx1.symmetric_difference(idx2) + Int64Index([1, 5], dtype='int64') - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.isna() - array([False, False, True], dtype=bool) + You can also use the ``^`` operator: - Empty strings are not considered NA values. None is considered an NA - value. + >>> idx1 ^ idx2 + Int64Index([1, 5], dtype='int64') + """ + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.isna() - array([False, False, False, True], dtype=bool) + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) - For datetimes, `NaT` (Not a Time) is considered as an NA value. + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, + assume_unique=True) + left_diff = this.values.take(left_indexer) - >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... pd.Timestamp(''), None, pd.NaT]) - >>> idx - DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) - >>> idx.isna() - array([False, True, True, True], dtype=bool) - """ - return self._isnan - isnull = isna + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.values.take(right_indexer) - def notna(self): - """ - Detect existing (non-missing) values. + the_diff = _concat._concat_compat([left_diff, right_diff]) + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass - Return a boolean same-sized object indicating if the values are not NA. - Non-missing values get mapped to ``True``. Characters such as empty - strings ``''`` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). - NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` - values. + attribs = self._get_attributes_dict() + attribs['name'] = result_name + if 'freq' in attribs: + attribs['freq'] = None + return self._shallow_copy_with_infer(the_diff, **attribs) - .. versionadded:: 0.20.0 + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError('Input must be Index or array-like') + return True - Returns - ------- - numpy.ndarray - Boolean array to indicate which entries are not NA. 
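# Sketch of the duplicate handling in symmetric_difference above (assuming
# pandas is importable as pd): both operands are reduced to their unique
# values first, so repeated labels do not inflate the result.
import pandas as pd

idx1 = pd.Index([1, 1, 2, 3])
idx2 = pd.Index([2, 3, 4, 4])
result = idx1.symmetric_difference(idx2)
# expected along the lines of: Int64Index([1, 4], dtype='int64')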
+ def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name - See also - -------- - Index.notnull : alias of notna - Index.isna: inverse of notna - pandas.notna : top-level notna + # -------------------------------------------------------------------- + # Indexing Methods - Examples - -------- - Show which entries in an Index are not NA. The result is an - array. + _index_shared_docs['get_loc'] = """ + Get integer location, slice or boolean mask for requested label. - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.notna() - array([ True, True, False]) + Parameters + ---------- + key : label + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + tolerance : optional + Maximum distance from index value for inexact matches. The value of + the index at the matching location most satisfy the equation + ``abs(index[loc] - key) <= tolerance``. - Empty strings are not considered NA values. None is considered a NA - value. + Tolerance may be a scalar + value, which applies the same tolerance to all values, or + list-like, which applies variable tolerance per element. List-like + includes list, tuple, array, Series, and must be the same size as + the index and its dtype must exactly match the index's type. - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.notna() - array([ True, True, True, False]) - """ - return ~self.isna() - notnull = notna + .. 
versionadded:: 0.21.0 (list-like tolerance) - def putmask(self, mask, value): - """ - return a new Index of the values set with the mask + Returns + ------- + loc : int if unique index, slice if monotonic index, else mask - See also - -------- - numpy.ndarray.putmask - """ - values = self.values.copy() - try: - np.putmask(values, mask, self._convert_for_op(value)) - return self._shallow_copy(values) - except (ValueError, TypeError) as err: - if is_object_dtype(self): - raise err + Examples + --------- + >>> unique_index = pd.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 - # coerces to object - return self.astype(object).putmask(mask, value) + >>> monotonic_index = pd.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) - def format(self, name=False, formatter=None, **kwargs): - """ - Render a string representation of the Index + >>> non_monotonic_index = pd.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True], dtype=bool) """ - header = [] - if name: - header.append(pprint_thing(self.name, - escape_chars=('\t', '\r', '\n')) if - self.name is not None else '') - if formatter is not None: - return header + list(self.map(formatter)) - - return self._format_with_header(header, **kwargs) + @Appender(_index_shared_docs['get_loc']) + def get_loc(self, key, method=None, tolerance=None): + if method is None: + if tolerance is not None: + raise ValueError('tolerance argument only valid if using pad, ' + 'backfill or nearest lookups') + try: + return self._engine.get_loc(key) + except KeyError: + return self._engine.get_loc(self._maybe_cast_indexer(key)) + indexer = self.get_indexer([key], method=method, tolerance=tolerance) + if indexer.ndim > 1 or indexer.size > 1: + raise TypeError('get_loc requires scalar valued input') + loc = indexer.item() + if loc == -1: + raise KeyError(key) + return loc - def _format_with_header(self, header, na_rep='NaN', **kwargs): - values = self.values + _index_shared_docs['get_indexer'] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. - from pandas.io.formats.format import format_array + Parameters + ---------- + target : %(target_klass)s + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + limit : int, optional + Maximum number of consecutive labels in ``target`` to match for + inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - if is_categorical_dtype(values.dtype): - values = np.array(values) - - elif is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, safe=1) + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. 
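# Hypothetical use of the tolerance machinery described above (assuming
# pandas is importable as pd): with method='nearest', matches farther away
# than the tolerance are reported as -1, just like genuinely missing labels.
import pandas as pd

idx = pd.Index([0, 5, 10])
indexer = idx.get_indexer([1, 7, 20], method='nearest', tolerance=3)
# expected along the lines of: array([ 0,  1, -1])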
- if is_object_dtype(values.dtype): - result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) - for x in values] + .. versionadded:: 0.21.0 (list-like tolerance) - # could have nans - mask = isna(values) - if mask.any(): - result = np.array(result) - result[mask] = na_rep - result = result.tolist() + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. - else: - result = _trim_front(format_array(values, None, justify='left')) - return header + result + Examples + -------- + >>> index = pd.Index(['c', 'a', 'b']) + >>> index.get_indexer(['a', 'b', 'x']) + array([ 1, 2, -1]) - def to_native_types(self, slicer=None, **kwargs): + Notice that the return value is an array of locations in ``index`` + and ``x`` is marked by -1, as it is not in ``index``. """ - Format specified values of `self` and return them. - Parameters - ---------- - slicer : int, array-like - An indexer into `self` that specifies which values - are used in the formatting process. - kwargs : dict - Options for specifying how the values should be formatted. - These options include the following: + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + method = missing.clean_reindex_fill_method(method) + target = ensure_index(target) + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, target) - 1) na_rep : str - The value that serves as a placeholder for NULL values - 2) quoting : bool or None - Whether or not there are quoted values in `self` - 3) date_format : str - The format used to represent date-like values - """ + # Treat boolean labels passed to a numeric index as not found. Without + # this fix False and True would be treated as 0 and 1 respectively. 
+ # (GH #16877) + if target.is_boolean() and self.is_numeric(): + return ensure_platform_int(np.repeat(-1, target.size)) - values = self - if slicer is not None: - values = values[slicer] - return values._format_native_types(**kwargs) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer(ptarget, method=method, limit=limit, + tolerance=tolerance) - def _format_native_types(self, na_rep='', quoting=None, **kwargs): - """ actually format my specific types """ - mask = isna(self) - if not self.is_object() and not quoting: - values = np.asarray(self).astype(str) + if not is_dtype_equal(self.dtype, target.dtype): + this = self.astype(object) + target = target.astype(object) + return this.get_indexer(target, method=method, limit=limit, + tolerance=tolerance) + + if not self.is_unique: + raise InvalidIndexError('Reindexing only valid with uniquely' + ' valued Index objects') + + if method == 'pad' or method == 'backfill': + indexer = self._get_fill_indexer(target, method, limit, tolerance) + elif method == 'nearest': + indexer = self._get_nearest_indexer(target, limit, tolerance) else: - values = np.array(self, dtype=object, copy=True) + if tolerance is not None: + raise ValueError('tolerance argument only valid if doing pad, ' + 'backfill or nearest reindexing') + if limit is not None: + raise ValueError('limit argument only valid if doing pad, ' + 'backfill or nearest reindexing') - values[mask] = na_rep - return values + indexer = self._engine.get_indexer(target._ndarray_values) - def equals(self, other): + return ensure_platform_int(indexer) + + def _convert_tolerance(self, tolerance, target): + # override this method on subclasses + tolerance = np.asarray(tolerance) + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError('list-like tolerance size must match ' + 'target index size') + return tolerance + + def _get_fill_indexer(self, target, method, limit=None, tolerance=None): + if self.is_monotonic_increasing and target.is_monotonic_increasing: + method = (self._engine.get_pad_indexer if method == 'pad' else + self._engine.get_backfill_indexer) + indexer = method(target._ndarray_values, limit) + else: + indexer = self._get_fill_indexer_searchsorted(target, method, + limit) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target._ndarray_values, + indexer, + tolerance) + return indexer + + def _get_fill_indexer_searchsorted(self, target, method, limit=None): """ - Determines if two Index objects contain the same elements. + Fallback pad/backfill get_indexer that works for monotonic decreasing + indexes and non-monotonic targets. 
""" - if self.is_(other): - return True - - if not isinstance(other, Index): - return False + if limit is not None: + raise ValueError('limit argument for %r method only well-defined ' + 'if index and target are monotonic' % method) - if is_object_dtype(self) and not is_object_dtype(other): - # if other is not object, use other's logic for coercion - return other.equals(self) + side = 'left' if method == 'pad' else 'right' - try: - return array_equivalent(com.values_from_object(self), - com.values_from_object(other)) - except Exception: - return False + # find exact matches first (this simplifies the algorithm) + indexer = self.get_indexer(target) + nonexact = (indexer == -1) + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], + side) + if side == 'left': + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + indexer[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + indexer[indexer == len(self)] = -1 + return indexer - def identical(self, other): - """Similar to equals, but check that other comparable attributes are - also equal + def _get_nearest_indexer(self, target, limit, tolerance): """ - return (self.equals(other) and - all((getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables)) and - type(self) == type(other)) - - def asof(self, label): + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other (e.g., not strings or + tuples). """ - Return the label from the index, or, if not present, the previous one. + left_indexer = self.get_indexer(target, 'pad', limit=limit) + right_indexer = self.get_indexer(target, 'backfill', limit=limit) - Assuming that the index is sorted, return the passed index label if it - is in the index, or return the previous index label if the passed one - is not in the index. + target = np.asarray(target) + left_distances = abs(self.values[left_indexer] - target) + right_distances = abs(self.values[right_indexer] - target) - Parameters - ---------- - label : object - The label up to which the method returns the latest index label. + op = operator.lt if self.is_monotonic_increasing else operator.le + indexer = np.where(op(left_distances, right_distances) | + (right_indexer == -1), left_indexer, right_indexer) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target, indexer, + tolerance) + return indexer - Returns - ------- - object - The passed label if it is in the index. The previous label if the - passed label is not in the sorted index or `NaN` if there is no - such label. + def _filter_indexer_tolerance(self, target, indexer, tolerance): + distance = abs(self.values[indexer] - target) + indexer = np.where(distance <= tolerance, indexer, -1) + return indexer - See Also - -------- - Series.asof : Return the latest value in a Series up to the - passed index. - merge_asof : Perform an asof merge (similar to left join but it - matches on nearest key rather than equal key). - Index.get_loc : `asof` is a thin wrapper around `get_loc` - with method='pad'. 
+ # -------------------------------------------------------------------- + # Indexer Conversion Methods - Examples - -------- - `Index.asof` returns the latest index label up to the passed label. + _index_shared_docs['_convert_scalar_indexer'] = """ + Convert a scalar indexer. - >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) - >>> idx.asof('2014-01-01') - '2013-12-31' + Parameters + ---------- + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ - If the label is in the index, the method returns the passed label. + @Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] - >>> idx.asof('2014-01-02') - '2014-01-02' + if kind == 'iloc': + return self._validate_indexer('positional', key, kind) - If all of the labels in the index are later than the passed label, - NaN is returned. + if len(self) and not isinstance(self, ABCMultiIndex,): - >>> idx.asof('1999-01-02') - nan + # we can raise here if we are definitive that this + # is positional indexing (eg. .ix on with a float) + # or label indexing if we are using a type able + # to be represented in the index - If the index is not sorted, an error is raised. + if kind in ['getitem', 'ix'] and is_float(key): + if not self.is_floating(): + return self._invalid_indexer('label', key) - >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', - ... '2014-01-03']) - >>> idx_not_sorted.asof('2013-12-31') - Traceback (most recent call last): - ValueError: index must be monotonic increasing or decreasing - """ - try: - loc = self.get_loc(label, method='pad') - except KeyError: - return self._na_value - else: - if isinstance(loc, slice): - loc = loc.indices(len(self))[-1] - return self[loc] - - def asof_locs(self, where, mask): - """ - where : array of timestamps - mask : array of booleans where data is not NA - - """ - locs = self.values[mask].searchsorted(where.values, side='right') + elif kind in ['loc'] and is_float(key): - locs = np.where(locs > 0, locs - 1, 0) - result = np.arange(len(self))[mask].take(locs) + # we want to raise KeyError on string/mixed here + # technically we *could* raise a TypeError + # on anything but mixed though + if self.inferred_type not in ['floating', + 'mixed-integer-float', + 'string', + 'unicode', + 'mixed']: + return self._invalid_indexer('label', key) - first = mask.argmax() - result[(locs == 0) & (where.values < self.values[first])] = -1 + elif kind in ['loc'] and is_integer(key): + if not self.holds_integer(): + return self._invalid_indexer('label', key) - return result + return key - def sort_values(self, return_indexer=False, ascending=True): - """ - Return a sorted copy of the index. + _index_shared_docs['_convert_slice_indexer'] = """ + Convert a slice indexer. - Return a sorted copy of the index, and optionally return the indices - that sorted the index itself. + By definition, these are labels unless 'iloc' is passed in. + Floats are not allowed as the start, step, or stop of the slice. Parameters ---------- - return_indexer : bool, default False - Should the indices that would sort the index be returned. - ascending : bool, default True - Should the index values be sorted in an ascending order. + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ - Returns - ------- - sorted_index : pandas.Index - Sorted copy of the index. - indexer : numpy.ndarray, optional - The indices that the index itself was sorted by. 
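# Sketch of the label-vs-positional distinction that _convert_slice_indexer
# handles (assuming pandas is importable as pd): on a non-integer index, a
# label slice is inclusive of both endpoints, while an integer slice in plain
# [] is treated positionally and excludes the stop position.
import pandas as pd

s = pd.Series(range(4), index=list('abcd'))
s['a':'c']   # label slice, inclusive of 'c' -> 3 rows
s[0:2]       # positional slice, end-exclusive -> 2 rows ('a' and 'b')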
+ @Appender(_index_shared_docs['_convert_slice_indexer']) + def _convert_slice_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] - See Also - -------- - pandas.Series.sort_values : Sort values of a Series. - pandas.DataFrame.sort_values : Sort values in a DataFrame. + # if we are not a slice, then we are done + if not isinstance(key, slice): + return key - Examples - -------- - >>> idx = pd.Index([10, 100, 1, 1000]) - >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') + # validate iloc + if kind == 'iloc': + return slice(self._validate_indexer('slice', key.start, kind), + self._validate_indexer('slice', key.stop, kind), + self._validate_indexer('slice', key.step, kind)) - Sort values in ascending order (default behavior). + # potentially cast the bounds to integers + start, stop, step = key.start, key.stop, key.step - >>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') + # figure out if this is a positional indexer + def is_int(v): + return v is None or is_integer(v) - Sort values in descending order, and also get the indices `idx` was - sorted by. + is_null_slicer = start is None and stop is None + is_index_slice = is_int(start) and is_int(stop) + is_positional = is_index_slice and not self.is_integer() - >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) - """ - _as = self.argsort() - if not ascending: - _as = _as[::-1] + if kind == 'getitem': + """ + called from the getitem slicers, validate that we are in fact + integers + """ + if self.is_integer() or is_index_slice: + return slice(self._validate_indexer('slice', key.start, kind), + self._validate_indexer('slice', key.stop, kind), + self._validate_indexer('slice', key.step, kind)) - sorted_index = self.take(_as) + # convert the slice to an indexer here - if return_indexer: - return sorted_index, _as + # if we are mixed and have integers + try: + if is_positional and self.is_mixed(): + # Validate start & stop + if start is not None: + self.get_loc(start) + if stop is not None: + self.get_loc(stop) + is_positional = False + except KeyError: + if self.inferred_type == 'mixed-integer-float': + raise + + if is_null_slicer: + indexer = key + elif is_positional: + indexer = key else: - return sorted_index + try: + indexer = self.slice_indexer(start, stop, step, kind=kind) + except Exception: + if is_index_slice: + if self.is_integer(): + raise + else: + indexer = key + else: + raise - def sort(self, *args, **kwargs): - raise TypeError("cannot sort an Index object in-place, use " - "sort_values instead") + return indexer - def sortlevel(self, level=None, ascending=True, sort_remaining=None): + def _convert_listlike_indexer(self, keyarr, kind=None): """ - - For internal compatibility with with the Index API - - Sort the Index. This is for compat with MultiIndex - Parameters ---------- - ascending : boolean, default True - False to sort in descending order - - level, sort_remaining are compat parameters + keyarr : list-like + Indexer to convert. Returns ------- - sorted_index : Index + tuple (indexer, keyarr) + indexer is an ndarray or None if cannot convert + keyarr are tuple-safe keys """ - return self.sort_values(return_indexer=True, ascending=ascending) + if isinstance(keyarr, Index): + keyarr = self._convert_index_indexer(keyarr) + else: + keyarr = self._convert_arr_indexer(keyarr) - def shift(self, periods=1, freq=None): - """ - Shift index by desired number of time frequency increments. 
+ indexer = self._convert_list_indexer(keyarr, kind=kind) + return indexer, keyarr - This method is for shifting the values of datetime-like indexes - by a specified time increment a given number of times. + _index_shared_docs['_convert_arr_indexer'] = """ + Convert an array-like indexer to the appropriate dtype. Parameters ---------- - periods : int, default 1 - Number of periods (or increments) to shift by, - can be positive or negative. - freq : pandas.DateOffset, pandas.Timedelta or string, optional - Frequency increment to shift by. - If None, the index is shifted by its own `freq` attribute. - Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. + keyarr : array-like + Indexer to convert. Returns ------- - pandas.Index - shifted index - - See Also - -------- - Series.shift : Shift values of Series. - - Examples - -------- - Put the first 5 month starts of 2011 into an index. - - >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') - >>> month_starts - DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', - '2011-05-01'], - dtype='datetime64[ns]', freq='MS') + converted_keyarr : array-like + """ - Shift the index by 10 days. + @Appender(_index_shared_docs['_convert_arr_indexer']) + def _convert_arr_indexer(self, keyarr): + keyarr = com.asarray_tuplesafe(keyarr) + return keyarr - >>> month_starts.shift(10, freq='D') - DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', - '2011-05-11'], - dtype='datetime64[ns]', freq=None) + _index_shared_docs['_convert_index_indexer'] = """ + Convert an Index indexer to the appropriate dtype. - The default value of `freq` is the `freq` attribute of the index, - which is 'MS' (month start) in this example. + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. - >>> month_starts.shift(10) - DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', - '2012-03-01'], - dtype='datetime64[ns]', freq='MS') + Returns + ------- + converted_keyarr : Index (or sub-class) + """ - Notes - ----- - This method is only implemented for datetime-like index classes, - i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. - """ - raise NotImplementedError("Not supported for type %s" % - type(self).__name__) + @Appender(_index_shared_docs['_convert_index_indexer']) + def _convert_index_indexer(self, keyarr): + return keyarr - def argsort(self, *args, **kwargs): - """ - Return the integer indices that would sort the index. + _index_shared_docs['_convert_list_indexer'] = """ + Convert a list-like indexer to the appropriate dtype. Parameters ---------- - *args - Passed to `numpy.ndarray.argsort`. - **kwargs - Passed to `numpy.ndarray.argsort`. + keyarr : Index (or sub-class) + Indexer to convert. + kind : iloc, ix, loc, optional Returns ------- - numpy.ndarray - Integer indices that would sort the index if used as - an indexer. + positional indexer or None + """ - See also - -------- - numpy.argsort : Similar method for NumPy arrays. - Index.sort_values : Return sorted copy of Index. 
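# Sketch of what the tuple-safe conversion in _convert_arr_indexer guards
# against (assuming the internal helper keeps its current location in
# pandas.core.common): a list of tuple keys must stay a 1-D object array of
# tuples rather than becoming a 2-D array, or MultiIndex lookups would break.
import numpy as np
import pandas.core.common as com

raw = [(1, 2), (3, 4)]
np.asarray(raw).shape             # (2, 2) -> the shape we do NOT want for keys
com.asarray_tuplesafe(raw).shape  # (2,)   -> one element per tuple key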
+ @Appender(_index_shared_docs['_convert_list_indexer']) + def _convert_list_indexer(self, keyarr, kind=None): + if (kind in [None, 'iloc', 'ix'] and + is_integer_dtype(keyarr) and not self.is_floating() and + not isinstance(keyarr, ABCPeriodIndex)): - Examples - -------- - >>> idx = pd.Index(['b', 'a', 'd', 'c']) - >>> idx - Index(['b', 'a', 'd', 'c'], dtype='object') + if self.inferred_type == 'mixed-integer': + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + # missing values are flagged as -1 by get_indexer and negative + # indices are already converted to positive indices in the + # above if-statement, so the negative flags are changed to + # values outside the range of indices so as to trigger an + # IndexError in maybe_convert_indices + indexer[indexer < 0] = len(self) + from pandas.core.indexing import maybe_convert_indices + return maybe_convert_indices(indexer, len(self)) - >>> order = idx.argsort() - >>> order - array([1, 0, 3, 2]) + elif not self.inferred_type == 'integer': + keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) + return keyarr - >>> idx[order] - Index(['a', 'b', 'c', 'd'], dtype='object') + return None + + def _invalid_indexer(self, form, key): """ - result = self.asi8 - if result is None: - result = np.array(self) - return result.argsort(*args, **kwargs) + Consistent invalid indexer message. + """ + raise TypeError("cannot do {form} indexing on {klass} with these " + "indexers [{key}] of {kind}".format( + form=form, klass=type(self), key=key, + kind=type(key))) - def __add__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - return Index(np.array(self) + other) + # -------------------------------------------------------------------- + # Reindex Methods - def __radd__(self, other): - return Index(other + np.array(self)) + def _can_reindex(self, indexer): + """ + Check if we are allowing reindexing with this particular indexer. - def __iadd__(self, other): - # alias for __add__ - return self + other + Parameters + ---------- + indexer : an integer indexer - def __sub__(self, other): - return Index(np.array(self) - other) + Raises + ------ + ValueError if its a duplicate axis + """ - def __rsub__(self, other): - return Index(other - np.array(self)) + # trying to reindex on an axis with duplicates + if not self.is_unique and len(indexer): + raise ValueError("cannot reindex from a duplicate axis") - def __and__(self, other): - return self.intersection(other) + def reindex(self, target, method=None, level=None, limit=None, + tolerance=None): + """ + Create index with target's values (move/add/delete values + as necessary). - def __or__(self, other): - return self.union(other) + Parameters + ---------- + target : an iterable - def __xor__(self, other): - return self.symmetric_difference(other) + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index - def _get_consensus_name(self, other): - """ - Given 2 indexes, give a consensus name meaning - we take the not None one, or None if the names differ. - Return a new object if we are resetting the name """ - if self.name != other.name: - if self.name is None or other.name is None: - name = self.name or other.name + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, 'name') + + # GH7774: preserve dtype/tz if target is empty and not an Index. 
+ target = _ensure_has_len(target) # target may be an iterator + + if not isinstance(target, Index) and len(target) == 0: + attrs = self._get_attributes_dict() + attrs.pop('freq', None) # don't preserve freq + values = self._data[:0] # appropriately-dtyped empty array + target = self._simple_new(values, dtype=self.dtype, **attrs) + else: + target = ensure_index(target) + + if level is not None: + if method is not None: + raise TypeError('Fill method not supported if level passed') + _, indexer, _ = self._join_level(target, level, how='right', + return_indexers=True) + else: + if self.equals(target): + indexer = None else: - name = None - if self.name != name: - return self._shallow_copy(name=name) - return self - def union(self, other): + if self.is_unique: + indexer = self.get_indexer(target, method=method, + limit=limit, + tolerance=tolerance) + else: + if method is not None or limit is not None: + raise ValueError("cannot reindex a non-unique index " + "with a method or limit") + indexer, missing = self.get_indexer_non_unique(target) + + if preserve_names and target.nlevels == 1 and target.name != self.name: + target = target.copy() + target.name = self.name + + return target, indexer + + def _reindex_non_unique(self, target): """ - Form the union of two Index objects and sorts if possible. + Create a new index with target's values (move/add/delete values as + necessary) use with non-unique Index and a possibly non-unique target. Parameters ---------- - other : Index or array-like + target : an iterable Returns ------- - union : Index - - Examples - -------- - - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.union(idx2) - Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index """ - self._assert_can_do_setop(other) - other = ensure_index(other) - if len(other) == 0 or self.equals(other): - return self._get_consensus_name(other) - - if len(self) == 0: - return other._get_consensus_name(self) - - # TODO: is_dtype_union_equal is a hack around - # 1. buggy set ops with duplicates (GH #13432) - # 2. CategoricalIndex lacking setops (GH #10186) - # Once those are fixed, this workaround can be removed - if not is_dtype_union_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.union(other) + target = ensure_index(target) + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels = self.take(indexer[check]) + new_indexer = None - # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self) or is_datetime64tz_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other) or is_datetime64tz_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values + if len(missing): + length = np.arange(len(indexer)) - if self.is_monotonic and other.is_monotonic: - try: - result = self._outer_indexer(lvals, rvals)[0] - except TypeError: - # incomparable objects - result = list(lvals) + missing = ensure_platform_int(missing) + missing_labels = target.take(missing) + missing_indexer = ensure_int64(length[~check]) + cur_labels = self.take(indexer[check]).values + cur_indexer = ensure_int64(length[check]) - # worth making this faster? 
a very unusual case - value_set = set(lvals) - result.extend([x for x in rvals if x not in value_set]) - else: - indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels - if len(indexer) > 0: - other_diff = algos.take_nd(rvals, indexer, - allow_fill=False) - result = _concat._concat_compat((lvals, other_diff)) + # a unique indexer + if target.is_unique: - try: - lvals[0] < other_diff[0] - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) - else: - types = frozenset((self.inferred_type, - other.inferred_type)) - if not types & _unsortable_types: - result.sort() + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 + # we have a non_unique selector, need to use the original + # indexer here else: - result = lvals - try: - result = np.sort(result) - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) + # need to retake to have the same size as the indexer + indexer[~check] = -1 - # for subclasses - return self._wrap_union_result(other, result) + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 - def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None - return self.__class__(result, name=name) + new_index = self._shallow_copy_with_infer(new_labels, freq=None) + return new_index, indexer, new_indexer - def intersection(self, other): - """ - Form the intersection of two Index objects. + # -------------------------------------------------------------------- + # Join Methods - This returns a new Index with elements common to the index and `other`, - preserving the order of the calling index. + _index_shared_docs['join'] = """ + Compute join_index and indexers to conform data + structures to the new index. Parameters ---------- - other : Index or array-like + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : boolean, default False + sort : boolean, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword) + + .. 
versionadded:: 0.20.0 Returns ------- - intersection : Index - - Examples - -------- - - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.intersection(idx2) - Int64Index([3, 4], dtype='int64') - + join_index, (left_indexer, right_indexer) """ - self._assert_can_do_setop(other) - other = ensure_index(other) - - if self.equals(other): - return self._get_consensus_name(other) - if not is_dtype_equal(self.dtype, other.dtype): + @Appender(_index_shared_docs['join']) + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): + from .multi import MultiIndex + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + # try to figure out the join level + # GH3662 + if level is None and (self_is_mi or other_is_mi): + + # have the same levels/names so a simple join + if self.names == other.names: + pass + else: + return self._join_multi(other, how=how, + return_indexers=return_indexers) + + # join on the level + if level is not None and (self_is_mi or other_is_mi): + return self._join_level(other, level, how=how, + return_indexers=return_indexers) + + other = ensure_index(other) + + if len(other) == 0 and how in ('left', 'outer'): + join_index = self._shallow_copy() + if return_indexers: + rindexer = np.repeat(-1, len(join_index)) + return join_index, None, rindexer + else: + return join_index + + if len(self) == 0 and how in ('right', 'outer'): + join_index = other._shallow_copy() + if return_indexers: + lindexer = np.repeat(-1, len(join_index)) + return join_index, lindexer, None + else: + return join_index + + if self._join_precedence < other._join_precedence: + how = {'right': 'left', 'left': 'right'}.get(how, how) + result = other.join(self, how=how, level=level, + return_indexers=return_indexers) + if return_indexers: + x, y, z = result + result = x, z, y + return result + + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') - return this.intersection(other) + return this.join(other, how=how, return_indexers=return_indexers) - # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values + _validate_join_method(how) - if self.is_monotonic and other.is_monotonic: + if not self.is_unique and not other.is_unique: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif not self.is_unique or not other.is_unique: + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + else: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(lvals, rvals)[0] - return self._wrap_union_result(other, result) + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) except TypeError: pass - try: - indexer = Index(rvals).get_indexer(lvals) - indexer = indexer.take((indexer != -1).nonzero()[0]) - except Exception: - # duplicates - indexer = algos.unique1d( - Index(rvals).get_indexer_non_unique(lvals)[0]) - indexer = indexer[indexer != -1] - - taken = other.take(indexer) - if self.name != other.name: - taken.name = None - return taken + if how == 'left': + join_index = self + elif how == 'right': + join_index = other + elif how == 'inner': + join_index = self.intersection(other) + 
elif how == 'outer': + join_index = self.union(other) - def difference(self, other): - """ - Return a new Index with elements from the index that are not in - `other`. + if sort: + join_index = join_index.sort_values() - This is the set difference of two Index objects. - It's sorted if sorting is possible. + if return_indexers: + if join_index is self: + lindexer = None + else: + lindexer = self.get_indexer(join_index) + if join_index is other: + rindexer = None + else: + rindexer = other.get_indexer(join_index) + return join_index, lindexer, rindexer + else: + return join_index - Parameters - ---------- - other : Index or array-like + def _join_multi(self, other, how, return_indexers=True): + from .multi import MultiIndex + from pandas.core.reshape.merge import _restore_dropped_levels_multijoin - Returns - ------- - difference : Index + # figure out join names + self_names = set(com._not_none(*self.names)) + other_names = set(com._not_none(*other.names)) + overlap = self_names & other_names - Examples - -------- + # need at least 1 in common + if not overlap: + raise ValueError("cannot join with no overlapping index names") - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.difference(idx2) - Int64Index([1, 2], dtype='int64') + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) - """ - self._assert_can_do_setop(other) + if self_is_mi and other_is_mi: - if self.equals(other): - return self._shallow_copy([]) + # Drop the non-matching levels from left and right respectively + ldrop_names = list(self_names - overlap) + rdrop_names = list(other_names - overlap) - other, result_name = self._convert_can_do_setop(other) + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) - this = self._get_unique_index() + # Join left and right + # Join on same leveled multi-index frames is supported + join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, + return_indexers=True) - indexer = this.get_indexer(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) + # Restore the dropped levels + # Returned index level order is + # common levels, ldrop_names, rdrop_names + dropped_names = ldrop_names + rdrop_names - label_diff = np.setdiff1d(np.arange(this.size), indexer, - assume_unique=True) - the_diff = this.values.take(label_diff) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + levels, codes, names = ( + _restore_dropped_levels_multijoin(self, other, + dropped_names, + join_idx, + lidx, ridx)) - return this._shallow_copy(the_diff, name=result_name, freq=None) + # Re-create the multi-index + multi_join_idx = MultiIndex(levels=levels, codes=codes, + names=names, verify_integrity=False) - def symmetric_difference(self, other, result_name=None): - """ - Compute the symmetric difference of two Index objects. - It's sorted if sorting is possible. + multi_join_idx = multi_join_idx.remove_unused_levels() - Parameters - ---------- - other : Index or array-like - result_name : str + return multi_join_idx, lidx, ridx - Returns - ------- - symmetric_difference : Index + jl = list(overlap)[0] - Notes - ----- - ``symmetric_difference`` contains elements that appear in either - ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by - ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates - dropped. 
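# Hypothetical use of Index.join as wired up above (assuming pandas is
# importable as pd): the indexers returned with return_indexers=True map
# positions in the joined index back into each original index, with -1 for
# labels present on only one side.
import pandas as pd

left = pd.Index([1, 2, 3])
right = pd.Index([2, 3, 4])
joined, lidx, ridx = left.join(right, how='outer', return_indexers=True)
# joined is roughly Int64Index([1, 2, 3, 4]); lidx ends with -1 and ridx
# starts with -1, marking the one-sided labels 4 and 1 respectively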
+ # Case where only one index is multi + # make the indices into mi's that match + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) - Examples - -------- - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([2, 3, 4, 5]) - >>> idx1.symmetric_difference(idx2) - Int64Index([1, 5], dtype='int64') + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) - You can also use the ``^`` operator: + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result - >>> idx1 ^ idx2 - Int64Index([1, 5], dtype='int64') - """ - self._assert_can_do_setop(other) - other, result_name_update = self._convert_can_do_setop(other) - if result_name is None: - result_name = result_name_update + def _join_non_unique(self, other, how='left', return_indexers=False): + from pandas.core.reshape.merge import _get_join_indexers - this = self._get_unique_index() - other = other._get_unique_index() - indexer = this.get_indexer(other) + left_idx, right_idx = _get_join_indexers([self._ndarray_values], + [other._ndarray_values], + how=how, + sort=True) - # {this} minus {other} - common_indexer = indexer.take((indexer != -1).nonzero()[0]) - left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, - assume_unique=True) - left_diff = this.values.take(left_indexer) + left_idx = ensure_platform_int(left_idx) + right_idx = ensure_platform_int(right_idx) - # {other} minus {this} - right_indexer = (indexer == -1).nonzero()[0] - right_diff = other.values.take(right_indexer) + join_index = np.asarray(self._ndarray_values.take(left_idx)) + mask = left_idx == -1 + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) - the_diff = _concat._concat_compat([left_diff, right_diff]) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + join_index = self._wrap_joined_index(join_index, other) - attribs = self._get_attributes_dict() - attribs['name'] = result_name - if 'freq' in attribs: - attribs['freq'] = None - return self._shallow_copy_with_infer(the_diff, **attribs) + if return_indexers: + return join_index, left_idx, right_idx + else: + return join_index - def _get_unique_index(self, dropna=False): + def _join_level(self, other, level, how='left', return_indexers=False, + keep_order=True): """ - Returns an index containing unique values. - - Parameters - ---------- - dropna : bool - If True, NaN values are dropped. + The join method *only* affects the level of the resulting + MultiIndex. Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. - Returns - ------- - uniques : index + If ```keep_order == True```, the order of the data indexed by the + MultiIndex will not be changed; otherwise, it will tie out + with `other`. """ - if self.is_unique and not dropna: - return self + from .multi import MultiIndex - values = self.values + def _get_leaf_sorter(labels): + """ + Returns sorter for the inner most level while preserving the + order of higher levels. 
+ """ + if labels[0].size == 0: + return np.empty(0, dtype='int64') - if not self.is_unique: - values = self.unique() + if len(labels) == 1: + lab = ensure_int64(labels[0]) + sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) + return sorter - if dropna: - try: - if self.hasnans: - values = values[~isna(values)] - except NotImplementedError: - pass + # find indexers of beginning of each set of + # same-key labels w.r.t all but last level + tic = labels[0][:-1] != labels[0][1:] + for lab in labels[1:-1]: + tic |= lab[:-1] != lab[1:] - return self._shallow_copy(values) + starts = np.hstack(([True], tic, [True])).nonzero()[0] + lab = ensure_int64(labels[-1]) + return lib.get_level_sorter(lab, ensure_int64(starts)) - _index_shared_docs['get_loc'] = """ - Get integer location, slice or boolean mask for requested label. + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise TypeError('Join on level between two MultiIndex objects ' + 'is ambiguous') - Parameters - ---------- - key : label - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. - * pad / ffill: find the PREVIOUS index value if no exact match. - * backfill / bfill: use NEXT index value if no exact match - * nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index value. - tolerance : optional - Maximum distance from index value for inexact matches. The value of - the index at the matching location most satisfy the equation - ``abs(index[loc] - key) <= tolerance``. + left, right = self, other - Tolerance may be a scalar - value, which applies the same tolerance to all values, or - list-like, which applies variable tolerance per element. List-like - includes list, tuple, array, Series, and must be the same size as - the index and its dtype must exactly match the index's type. + flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + how = {'right': 'left', 'left': 'right'}.get(how, how) - .. 
versionadded:: 0.21.0 (list-like tolerance) + level = left._get_level_number(level) + old_level = left.levels[level] - Returns - ------- - loc : int if unique index, slice if monotonic index, else mask + if not right.is_unique: + raise NotImplementedError('Index._join_level on non-unique index ' + 'is not implemented') - Examples - --------- - >>> unique_index = pd.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 + new_level, left_lev_indexer, right_lev_indexer = \ + old_level.join(right, how=how, return_indexers=True) - >>> monotonic_index = pd.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) + if left_lev_indexer is None: + if keep_order or len(left) == 0: + left_indexer = None + join_index = left + else: # sort the leaves + left_indexer = _get_leaf_sorter(left.codes[:level + 1]) + join_index = left[left_indexer] - >>> non_monotonic_index = pd.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True], dtype=bool) - """ + else: + left_lev_indexer = ensure_int64(left_lev_indexer) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, + len(old_level)) - @Appender(_index_shared_docs['get_loc']) - def get_loc(self, key, method=None, tolerance=None): - if method is None: - if tolerance is not None: - raise ValueError('tolerance argument only valid if using pad, ' - 'backfill or nearest lookups') - try: - return self._engine.get_loc(key) - except KeyError: - return self._engine.get_loc(self._maybe_cast_indexer(key)) - indexer = self.get_indexer([key], method=method, tolerance=tolerance) - if indexer.ndim > 1 or indexer.size > 1: - raise TypeError('get_loc requires scalar valued input') - loc = indexer.item() - if loc == -1: - raise KeyError(key) - return loc + new_lev_codes = algos.take_nd(rev_indexer, left.codes[level], + allow_fill=False) - def get_value(self, series, key): - """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing - """ + new_codes = list(left.codes) + new_codes[level] = new_lev_codes - # if we have something that is Index-like, then - # use this, e.g. DatetimeIndex - s = getattr(series, '_values', None) - if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): - # GH 20882, 21257 - # Unify Index and ExtensionArray treatment - # First try to convert the key to a location - # If that fails, raise a KeyError if an integer - # index, otherwise, see if key is an integer, and - # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if (len(self) > 0 - and (self.holds_integer() or self.is_boolean())): - raise - elif is_integer(key): - return s[key] + new_levels = list(left.levels) + new_levels[level] = new_level - s = com.values_from_object(series) - k = com.values_from_object(key) + if keep_order: # just drop missing values. o.w. 
keep order + left_indexer = np.arange(len(left), dtype=np.intp) + mask = new_lev_codes != -1 + if not mask.all(): + new_codes = [lab[mask] for lab in new_codes] + left_indexer = left_indexer[mask] - k = self._convert_scalar_indexer(k, kind='getitem') - try: - return self._engine.get_value(s, k, - tz=getattr(series.dtype, 'tz', None)) - except KeyError as e1: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise + else: # tie out the order with other + if level == 0: # outer most level, take the fast route + ngroups = 1 + new_lev_codes.max() + left_indexer, counts = libalgos.groupsort_indexer( + new_lev_codes, ngroups) - try: - return libindex.get_value_box(s, key) - except IndexError: - raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 - except Exception: # pragma: no cover - raise e1 - except TypeError: - # python 3 - if is_scalar(key): # pragma: no cover - raise IndexError(key) - raise InvalidIndexError(key) + # missing values are placed first; drop them! + left_indexer = left_indexer[counts[0]:] + new_codes = [lab[left_indexer] for lab in new_codes] - def set_value(self, arr, key, value): - """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing - """ - self._engine.set_value(com.values_from_object(arr), - com.values_from_object(key), value) + else: # sort the leaves + mask = new_lev_codes != -1 + mask_all = mask.all() + if not mask_all: + new_codes = [lab[mask] for lab in new_codes] - def _get_level_values(self, level): - """ - Return an Index of values for requested level. + left_indexer = _get_leaf_sorter(new_codes[:level + 1]) + new_codes = [lab[left_indexer] for lab in new_codes] - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatability. + # left_indexers are w.r.t masked frame. + # reverse to original frame! + if not mask_all: + left_indexer = mask.nonzero()[0][left_indexer] - Parameters - ---------- - level : int or str - It is either the integer position or the name of the level. + join_index = MultiIndex(levels=new_levels, codes=new_codes, + names=left.names, verify_integrity=False) - Returns - ------- - values : Index - Calling object, as there is only one level in the Index. + if right_lev_indexer is not None: + right_indexer = algos.take_nd(right_lev_indexer, + join_index.codes[level], + allow_fill=False) + else: + right_indexer = join_index.codes[level] - See also - -------- - MultiIndex.get_level_values : get values for a level of a MultiIndex + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer - Notes - ----- - For Index, level should be 0, since there are no multiple levels. 
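# Rough sketch of the level-based join handled by _join_multi/_join_level
# above (assuming pandas is importable as pd): a flat Index and a MultiIndex
# that share a level name can be joined directly, and the result keeps the
# MultiIndex structure aligned on that level.
import pandas as pd

mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=['key', 'num'])
flat = pd.Index(['a', 'b'], name='key')
joined = mi.join(flat, how='inner')
# joined should be a MultiIndex whose 'key' level contains only labels
# present in both operands (here: all of them)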
+ if return_indexers: + left_indexer = (None if left_indexer is None + else ensure_platform_int(left_indexer)) + right_indexer = (None if right_indexer is None + else ensure_platform_int(right_indexer)) + return join_index, left_indexer, right_indexer + else: + return join_index - Examples - -------- + def _join_monotonic(self, other, how='left', return_indexers=False): + if self.equals(other): + ret_index = other if how == 'right' else self + if return_indexers: + return ret_index, None, None + else: + return ret_index - >>> idx = pd.Index(list('abc')) - >>> idx - Index(['a', 'b', 'c'], dtype='object') + sv = self._ndarray_values + ov = other._ndarray_values - Get level values by supplying `level` as integer: + if self.is_unique and other.is_unique: + # We can perform much better than the general case + if how == 'left': + join_index = self + lidx = None + ridx = self._left_indexer_unique(sv, ov) + elif how == 'right': + join_index = other + lidx = self._left_indexer_unique(ov, sv) + ridx = None + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + else: + if how == 'left': + join_index, lidx, ridx = self._left_indexer(sv, ov) + elif how == 'right': + join_index, ridx, lidx = self._left_indexer(ov, sv) + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) - >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') - """ - self._validate_index_level(level) - return self + if return_indexers: + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + else: + return join_index - get_level_values = _get_level_values + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return Index(joined, name=name) - def droplevel(self, level=0): + # -------------------------------------------------------------------- + # Uncategorized Methods + + @property + def values(self): """ - Return index with requested level(s) removed. If resulting index has - only 1 level left, the result will be of Index type, not MultiIndex. + Return an array representing the data in the Index. - .. versionadded:: 0.23.1 (support for non-MultiIndex) + .. warning:: - Parameters - ---------- - level : int, str, or list-like, default 0 - If a string is given, must be the name of a level - If list-like, elements must be names or indexes of levels. + We recommend using :attr:`Index.array` or + :meth:`Index.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. 
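To make the `_join_level` path above concrete, here is a hedged, user-level sketch using the public `Index.join` with `level=`; the index contents and names are invented for the example and the printed reprs depend on the pandas version.

import pandas as pd

midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
                                  names=['outer', 'inner'])
flat = pd.Index(['b', 'a'], name='outer')

# join the flat index against the 'outer' level only; the remaining levels of
# the MultiIndex are carried along, which is exactly what _join_level handles
joined, lidx, ridx = midx.join(flat, how='left', level='outer',
                               return_indexers=True)
print(joined)
print(lidx, ridx)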
Returns ------- - index : Index or MultiIndex - """ - if not isinstance(level, (tuple, list)): - level = [level] - - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] - - if len(level) == 0: - return self - if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) - # The two checks above guarantee that here self is a MultiIndex + array: numpy.ndarray or ExtensionArray - new_levels = list(self.levels) - new_labels = list(self.labels) - new_names = list(self.names) + See Also + -------- + Index.array : Reference to the underlying data. + Index.to_numpy : A NumPy array representing the underlying data. - for i in levnums: - new_levels.pop(i) - new_labels.pop(i) - new_names.pop(i) + Return the underlying data as an ndarray. + """ + return self._data.view(np.ndarray) - if len(new_levels) == 1: + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index, np.ndarray] + # TODO(EA): remove index types as they become extension arrays + """ + The best array representation. - # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) - if mask.any(): - result = result.putmask(mask, np.nan) + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. - result.name = new_names[0] - return result - else: - from .multi import MultiIndex - return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. - _index_shared_docs['get_indexer'] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. + It may differ from the public '.values' method. - Parameters - ---------- - target : %(target_klass)s - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. - * pad / ffill: find the PREVIOUS index value if no exact match. - * backfill / bfill: use NEXT index value if no exact match - * nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index value. - limit : int, optional - Maximum number of consecutive labels in ``target`` to match for - inexact matches. - tolerance : optional - Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations most - satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------- | --------------- | + Index | ndarray | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | ndarray[int] | + DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | + IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | - Tolerance may be a scalar value, which applies the same tolerance - to all values, or list-like, which applies variable tolerance per - element. List-like includes list, tuple, array, Series, and must be - the same size as the index and its dtype must exactly match the - index's type. 
+ See Also + -------- + values + _ndarray_values + """ + return self._data - .. versionadded:: 0.21.0 (list-like tolerance) + def get_values(self): + """ + Return `Index` data as an `numpy.ndarray`. Returns ------- - indexer : ndarray of int - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. Missing values - in the target are marked by -1. + numpy.ndarray + A one-dimensional numpy array of the `Index` values. + + See Also + -------- + Index.values : The attribute that get_values wraps. Examples -------- - >>> index = pd.Index(['c', 'a', 'b']) - >>> index.get_indexer(['a', 'b', 'x']) - array([ 1, 2, -1]) + Getting the `Index` values of a `DataFrame`: - Notice that the return value is an array of locations in ``index`` - and ``x`` is marked by -1, as it is not in ``index``. + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) + >>> df + A B C + a 1 2 3 + b 4 5 6 + c 7 8 9 + >>> df.index.get_values() + array(['a', 'b', 'c'], dtype=object) + + Standalone `Index` values: + + >>> idx = pd.Index(['1', '2', '3']) + >>> idx.get_values() + array(['1', '2', '3'], dtype=object) + + `MultiIndex` arrays also have only one dimension: + >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], + ... names=('number', 'letter')) + >>> midx.get_values() + array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) + >>> midx.get_values().ndim + 1 """ + return self.values - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) - def get_indexer(self, target, method=None, limit=None, tolerance=None): - method = missing.clean_reindex_fill_method(method) - target = ensure_index(target) - if tolerance is not None: - tolerance = self._convert_tolerance(tolerance, target) + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super(Index, self).memory_usage(deep=deep) - # Treat boolean labels passed to a numeric index as not found. Without - # this fix False and True would be treated as 0 and 1 respectively. - # (GH #16877) - if target.is_boolean() and self.is_numeric(): - return ensure_platform_int(np.repeat(-1, target.size)) + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result - pself, ptarget = self._maybe_promote(target) - if pself is not self or ptarget is not target: - return pself.get_indexer(ptarget, method=method, limit=limit, - tolerance=tolerance) + _index_shared_docs['where'] = """ + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. - if not is_dtype_equal(self.dtype, target.dtype): - this = self.astype(object) - target = target.astype(object) - return this.get_indexer(target, method=method, limit=limit, - tolerance=tolerance) + .. 
versionadded:: 0.19.0 - if not self.is_unique: - raise InvalidIndexError('Reindexing only valid with uniquely' - ' valued Index objects') + Parameters + ---------- + cond : boolean array-like with the same length as self + other : scalar, or array-like + """ - if method == 'pad' or method == 'backfill': - indexer = self._get_fill_indexer(target, method, limit, tolerance) - elif method == 'nearest': - indexer = self._get_nearest_indexer(target, limit, tolerance) - else: - if tolerance is not None: - raise ValueError('tolerance argument only valid if doing pad, ' - 'backfill or nearest reindexing') - if limit is not None: - raise ValueError('limit argument only valid if doing pad, ' - 'backfill or nearest reindexing') + @Appender(_index_shared_docs['where']) + def where(self, cond, other=None): + if other is None: + other = self._na_value - indexer = self._engine.get_indexer(target._ndarray_values) + dtype = self.dtype + values = self.values - return ensure_platform_int(indexer) + if is_bool(other) or is_bool_dtype(other): - def _convert_tolerance(self, tolerance, target): - # override this method on subclasses - tolerance = np.asarray(tolerance) - if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') - return tolerance + # bools force casting + values = values.astype(object) + dtype = None - def _get_fill_indexer(self, target, method, limit=None, tolerance=None): - if self.is_monotonic_increasing and target.is_monotonic_increasing: - method = (self._engine.get_pad_indexer if method == 'pad' else - self._engine.get_backfill_indexer) - indexer = method(target._ndarray_values, limit) - else: - indexer = self._get_fill_indexer_searchsorted(target, method, - limit) - if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._ndarray_values, - indexer, - tolerance) - return indexer + values = np.where(cond, values, other) - def _get_fill_indexer_searchsorted(self, target, method, limit=None): - """ - Fallback pad/backfill get_indexer that works for monotonic decreasing - indexes and non-monotonic targets + if self._is_numeric_dtype and np.any(isna(values)): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None + + return self._shallow_copy_with_infer(values, dtype=dtype) + + # construction helpers + @classmethod + def _try_convert_to_int_index(cls, data, copy, name, dtype): """ - if limit is not None: - raise ValueError('limit argument for %r method only well-defined ' - 'if index and target are monotonic' % method) + Attempt to convert an array of data into an integer index. - side = 'left' if method == 'pad' else 'right' + Parameters + ---------- + data : The data to convert. + copy : Whether to copy the data or not. + name : The name of the index returned. - # find exact matches first (this simplifies the algorithm) - indexer = self.get_indexer(target) - nonexact = (indexer == -1) - indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], - side) - if side == 'left': - # searchsorted returns "indices into a sorted array such that, - # if the corresponding elements in v were inserted before the - # indices, the order of a would be preserved". - # Thus, we need to subtract 1 to find values to the left. 
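A short usage sketch of the `where` implementation above (sample values invented; exact reprs vary by pandas version): values are kept where the condition holds, and a NaN fill drops the integer dtype so the result can be re-inferred.

import pandas as pd

idx = pd.Index([1, 2, 3, 4])

# the fill value defaults to the index's NA value; NaN forces a float result
print(idx.where(idx > 2))
# an integer fill keeps the integer dtype
print(idx.where(idx > 2, other=0))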
- indexer[nonexact] -= 1 - # This also mapped not found values (values of 0 from - # np.searchsorted) to -1, which conveniently is also our - # sentinel for missing values - else: - # Mark indices to the right of the largest value as not found - indexer[indexer == len(self)] = -1 - return indexer + Returns + ------- + int_index : data converted to either an Int64Index or a + UInt64Index - def _get_nearest_indexer(self, target, limit, tolerance): - """ - Get the indexer for the nearest index labels; requires an index with - values that can be subtracted from each other (e.g., not strings or - tuples). + Raises + ------ + ValueError if the conversion was not successful. """ - left_indexer = self.get_indexer(target, 'pad', limit=limit) - right_indexer = self.get_indexer(target, 'backfill', limit=limit) - target = np.asarray(target) - left_distances = abs(self.values[left_indexer] - target) - right_distances = abs(self.values[right_indexer] - target) + from .numeric import Int64Index, UInt64Index + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desrired + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass - op = operator.lt if self.is_monotonic_increasing else operator.le - indexer = np.where(op(left_distances, right_distances) | - (right_indexer == -1), left_indexer, right_indexer) - if tolerance is not None: - indexer = self._filter_indexer_tolerance(target, indexer, - tolerance) - return indexer + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. + try: + res = data.astype('u8', copy=False) + if (res == data).all(): + return UInt64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass - def _filter_indexer_tolerance(self, target, indexer, tolerance): - distance = abs(self.values[indexer] - target) - indexer = np.where(distance <= tolerance, indexer, -1) - return indexer + raise ValueError - _index_shared_docs['get_indexer_non_unique'] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. + @classmethod + def _scalar_data_error(cls, data): + raise TypeError('{0}(...) must be called with a collection of some ' + 'kind, {1} was passed'.format(cls.__name__, + repr(data))) + + @classmethod + def _string_data_error(cls, data): + raise TypeError('String dtype not supported, you may need ' + 'to explicitly cast to a numeric type') + + @classmethod + def _coerce_to_ndarray(cls, data): + """ + Coerces data to ndarray. + + Converts other iterables to list first and then to array. + Does not touch ndarrays. + + Raises + ------ + TypeError + When the data passed in is a scalar. + """ + + if not isinstance(data, (np.ndarray, Index)): + if data is None or is_scalar(data): + cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + data = np.asarray(data) + return data + + def _coerce_scalar_to_index(self, item): + """ + We need to coerce a scalar to a compat for our index type. 
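To illustrate the lossless-cast check in `_try_convert_to_int_index` above as standalone code: `maybe_to_int64` is a hypothetical helper written for this sketch, not a pandas API.

import numpy as np

def maybe_to_int64(data):
    """Return an int64 version of ``data`` if the cast loses nothing, else None."""
    try:
        res = data.astype('i8', copy=False)
    except (OverflowError, TypeError, ValueError):
        return None
    # the round-trip equality test is the same check the method above applies
    return res if (res == data).all() else None

print(maybe_to_int64(np.array([1.0, 2.0, 3.0])))  # [1 2 3]
print(maybe_to_int64(np.array([1.5, 2.0])))       # None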
Parameters ---------- - target : %(target_klass)s - - Returns - ------- - indexer : ndarray of int - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. Missing values - in the target are marked by -1. - missing : ndarray of int - An indexer into the target of the values not found. - These correspond to the -1 in the indexer array + item : scalar item to coerce """ + dtype = self.dtype - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = ensure_index(target) - if is_categorical(target): - target = target.astype(target.dtype.categories.dtype) - pself, ptarget = self._maybe_promote(target) - if pself is not self or ptarget is not target: - return pself.get_indexer_non_unique(ptarget) + if self._is_numeric_dtype and isna(item): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None - if self.is_all_dates: - self = Index(self.asi8) - tgt_values = target.asi8 - else: - tgt_values = target._ndarray_values + return Index([item], dtype=dtype, **self._get_attributes_dict()) - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - return ensure_platform_int(indexer), missing + def _to_safe_for_reshape(self): + """ + Convert to object if we are a categorical. + """ + return self - def get_indexer_for(self, target, **kwargs): + def _convert_for_op(self, value): """ - guaranteed return of an indexer even when non-unique - This dispatches to get_indexer or get_indexer_nonunique as appropriate + Convert value to be insertable to ndarray. """ - if self.is_unique: - return self.get_indexer(target, **kwargs) - indexer, _ = self.get_indexer_non_unique(target, **kwargs) - return indexer + return value - def _maybe_promote(self, other): - # A hack, but it works - from pandas import DatetimeIndex - if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): - return DatetimeIndex(self), other - elif self.inferred_type == 'boolean': - if not is_object_dtype(self.dtype): - return self.astype('object'), other.astype('object') - return self, other + def _assert_can_do_op(self, value): + """ + Check value is valid for scalar op. + """ + if not is_scalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) - def groupby(self, values): + @property + def _has_complex_internals(self): + # to disable groupby tricks in MultiIndex + return False + + def _is_memory_usage_qualified(self): """ - Group the index labels by a given array of values. + Return a boolean if we need a qualified .info display. + """ + return self.is_object() + + def is_type_compatible(self, kind): + return kind == self.inferred_type + + _index_shared_docs['__contains__'] = """ + Return a boolean if this key is IN the index. Parameters ---------- - values : array - Values used to determine the groups. 
+ key : object Returns ------- - groups : dict - {group name -> group labels} + boolean """ - # TODO: if we are a MultiIndex, we can do better - # that converting to tuples - from .multi import MultiIndex - if isinstance(values, MultiIndex): - values = values.values - values = ensure_categorical(values) - result = values._reverse_indexer() - - # map to the label - result = {k: self.take(v) for k, v in compat.iteritems(result)} - - return result + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) + def __contains__(self, key): + hash(key) + try: + return key in self._engine + except (OverflowError, TypeError, ValueError): + return False - def map(self, mapper, na_action=None): - """ - Map values using input correspondence (a dict, Series, or function). + _index_shared_docs['contains'] = """ + Return a boolean if this key is IN the index. Parameters ---------- - mapper : function, dict, or Series - Mapping correspondence. - na_action : {None, 'ignore'} - If 'ignore', propagate NA values, without passing them to the - mapping correspondence. + key : object Returns ------- - applied : Union[Index, MultiIndex], inferred - The output of the mapping function applied to the index. - If the function returns a tuple with more than one element - a MultiIndex will be returned. + boolean """ - from .multi import MultiIndex - new_values = super(Index, self)._map_values( - mapper, na_action=na_action) - - attributes = self._get_attributes_dict() - - # we can return a MultiIndex - if new_values.size and isinstance(new_values[0], tuple): - if isinstance(self, MultiIndex): - names = self.names - elif attributes.get('name'): - names = [attributes.get('name')] * len(new_values[0]) - else: - names = None - return MultiIndex.from_tuples(new_values, - names=names) + @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + def contains(self, key): + hash(key) + try: + return key in self._engine + except (TypeError, ValueError): + return False - attributes['copy'] = False - if not new_values.size: - # empty - attributes['dtype'] = self.dtype + def __hash__(self): + raise TypeError("unhashable type: %r" % type(self).__name__) - return Index(new_values, **attributes) + def __setitem__(self, key, value): + raise TypeError("Index does not support mutable operations") - def isin(self, values, level=None): + def __getitem__(self, key): """ - Return a boolean array where the index values are in `values`. - - Compute boolean array of whether each index value is found in the - passed set of values. The length of the returned boolean array matches - the length of the index. + Override numpy.ndarray's __getitem__ method to work as desired. - Parameters - ---------- - values : set or list-like - Sought values. + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). - .. versionadded:: 0.18.1 + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. - Support for values as a set. + """ + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + getitem = self._data.__getitem__ + promote = self._shallow_copy - level : str or int, optional - Name or position of the index level to use (if the index is a - `MultiIndex`). + if is_scalar(key): + key = com.cast_scalar_indexer(key) + return getitem(key) - Returns - ------- - is_contained : ndarray - NumPy array of boolean values. 
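A few of the guarantees coded above (`__contains__`, `__setitem__`, `_scalar_data_error`) shown as a hedged usage sketch with invented sample data:

import pandas as pd

idx = pd.Index(['a', 'b', 'c'])

# __contains__ swallows lookup errors and simply reports membership
print('a' in idx)   # True
print(1 in idx)     # False

# Index is immutable; item assignment raises, per __setitem__ above
try:
    idx[0] = 'z'
except TypeError as err:
    print(err)

# construction requires a collection; scalars hit _scalar_data_error
try:
    pd.Index(5)
except TypeError as err:
    print(err)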
+ if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. + return promote(getitem(key)) - See also - -------- - Series.isin : Same for Series. - DataFrame.isin : Same method for DataFrames. + if com.is_bool_indexer(key): + key = np.asarray(key, dtype=bool) - Notes - ----- - In the case of `MultiIndex` you must either specify `values` as a - list-like object containing tuples that are the same length as the - number of levels, or specify `level`. Otherwise it will raise a - ``ValueError``. + key = com.values_from_object(key) + result = getitem(key) + if not is_scalar(result): + return promote(result) + else: + return result - If `level` is specified: + def _can_hold_identifiers_and_holds_name(self, name): + """ + Faster check for ``name in self`` when we know `name` is a Python + identifier (e.g. in NDFrame.__getattr__, which hits this to support + . key lookup). For indexes that can't hold identifiers (everything + but object & categorical) we just return False. - - if it is the name of one *and only one* index level, use that level; - - otherwise it should be a number indicating level position. + https://github.com/pandas-dev/pandas/issues/19764 + """ + if self.is_object() or self.is_categorical(): + return name in self + return False - Examples - -------- - >>> idx = pd.Index([1,2,3]) - >>> idx - Int64Index([1, 2, 3], dtype='int64') + def append(self, other): + """ + Append a collection of Index options together. - Check whether each index value in a list of values. - >>> idx.isin([1, 4]) - array([ True, False, False]) + Parameters + ---------- + other : Index or list/tuple of indices - >>> midx = pd.MultiIndex.from_arrays([[1,2,3], - ... ['red', 'blue', 'green']], - ... names=('number', 'color')) - >>> midx - MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']], - labels=[[0, 1, 2], [2, 0, 1]], - names=['number', 'color']) + Returns + ------- + appended : Index + """ - Check whether the strings in the 'color' level of the MultiIndex - are in a list of colors. + to_concat = [self] - >>> midx.isin(['red', 'orange', 'yellow'], level='color') - array([ True, False, False]) + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) - To check across the levels of a MultiIndex, pass a list of tuples: + for obj in to_concat: + if not isinstance(obj, Index): + raise TypeError('all inputs must be Index') - >>> midx.isin([(1, 'red'), (3, 'red')]) - array([ True, False, False]) + names = {obj.name for obj in to_concat} + name = None if len(names) > 1 else self.name - For a DatetimeIndex, string values in `values` are converted to - Timestamps. + return self._concat(to_concat, name) - >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] - >>> dti = pd.to_datetime(dates) - >>> dti - DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], - dtype='datetime64[ns]', freq=None) + def _concat(self, to_concat, name): - >>> dti.isin(['2000-03-11']) - array([ True, False, False]) + typs = _concat.get_dtype_kinds(to_concat) + + if len(typs) == 1: + return self._concat_same_dtype(to_concat, name=name) + return _concat._concat_index_asobject(to_concat, name=name) + + def _concat_same_dtype(self, to_concat, name): """ - if level is not None: - self._validate_index_level(level) - return algos.isin(self, values) + Concatenate to_concat which has the same class. 
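The name handling in `append` above, as a small hedged example (sample names invented):

import pandas as pd

left = pd.Index(['a', 'b'], name='letters')
right = pd.Index(['c', 'd'], name='letters')

# the shared name survives the append
print(left.append(right))

# differing names collapse to None, per the name handling above
print(left.append(pd.Index(['e'], name='other')))

# non-Index inputs are rejected
try:
    left.append(['x', 'y'])
except TypeError as err:
    print(err)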
+ """ + # must be overridden in specific classes + return _concat._concat_index_asobject(to_concat, name) - def _can_reindex(self, indexer): + def putmask(self, mask, value): """ - *this is an internal non-public method* + Return a new Index of the values set with the mask. - Check if we are allowing reindexing with this particular indexer + See Also + -------- + numpy.ndarray.putmask + """ + values = self.values.copy() + try: + np.putmask(values, mask, self._convert_for_op(value)) + return self._shallow_copy(values) + except (ValueError, TypeError) as err: + if is_object_dtype(self): + raise err - Parameters - ---------- - indexer : an integer indexer + # coerces to object + return self.astype(object).putmask(mask, value) - Raises - ------ - ValueError if its a duplicate axis + def equals(self, other): """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True - # trying to reindex on an axis with duplicates - if not self.is_unique and len(indexer): - raise ValueError("cannot reindex from a duplicate axis") + if not isinstance(other, Index): + return False - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + if is_object_dtype(self) and not is_object_dtype(other): + # if other is not object, use other's logic for coercion + return other.equals(self) + + try: + return array_equivalent(com.values_from_object(self), + com.values_from_object(other)) + except Exception: + return False + + def identical(self, other): + """ + Similar to equals, but check that other comparable attributes are + also equal. + """ + return (self.equals(other) and + all((getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables)) and + type(self) == type(other)) + + def asof(self, label): """ - Create index with target's values (move/add/delete values as necessary) + Return the label from the index, or, if not present, the previous one. + + Assuming that the index is sorted, return the passed index label if it + is in the index, or return the previous index label if the passed one + is not in the index. Parameters ---------- - target : an iterable + label : object + The label up to which the method returns the latest index label. Returns ------- - new_index : pd.Index - Resulting index - indexer : np.ndarray or None - Indices of output values in original index + object + The passed label if it is in the index. The previous label if the + passed label is not in the sorted index or `NaN` if there is no + such label. - """ - # GH6552: preserve names when reindexing to non-named target - # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, 'name') + See Also + -------- + Series.asof : Return the latest value in a Series up to the + passed index. + merge_asof : Perform an asof merge (similar to left join but it + matches on nearest key rather than equal key). + Index.get_loc : An `asof` is a thin wrapper around `get_loc` + with method='pad'. - # GH7774: preserve dtype/tz if target is empty and not an Index. - target = _ensure_has_len(target) # target may be an iterator + Examples + -------- + `Index.asof` returns the latest index label up to the passed label. 
- if not isinstance(target, Index) and len(target) == 0: - attrs = self._get_attributes_dict() - attrs.pop('freq', None) # don't preserve freq - target = self._simple_new(None, dtype=self.dtype, **attrs) - else: - target = ensure_index(target) + >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) + >>> idx.asof('2014-01-01') + '2013-12-31' - if level is not None: - if method is not None: - raise TypeError('Fill method not supported if level passed') - _, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True) - else: - if self.equals(target): - indexer = None - else: + If the label is in the index, the method returns the passed label. - if self.is_unique: - indexer = self.get_indexer(target, method=method, - limit=limit, - tolerance=tolerance) - else: - if method is not None or limit is not None: - raise ValueError("cannot reindex a non-unique index " - "with a method or limit") - indexer, missing = self.get_indexer_non_unique(target) + >>> idx.asof('2014-01-02') + '2014-01-02' - if preserve_names and target.nlevels == 1 and target.name != self.name: - target = target.copy() - target.name = self.name + If all of the labels in the index are later than the passed label, + NaN is returned. - return target, indexer + >>> idx.asof('1999-01-02') + nan - def _reindex_non_unique(self, target): + If the index is not sorted, an error is raised. + + >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', + ... '2014-01-03']) + >>> idx_not_sorted.asof('2013-12-31') + Traceback (most recent call last): + ValueError: index must be monotonic increasing or decreasing """ - *this is an internal non-public method* + try: + loc = self.get_loc(label, method='pad') + except KeyError: + return self._na_value + else: + if isinstance(loc, slice): + loc = loc.indices(len(self))[-1] + return self[loc] - Create a new index with target's values (move/add/delete values as - necessary) use with non-unique Index and a possibly non-unique target + def asof_locs(self, where, mask): + """ + Finds the locations (indices) of the labels from the index for + every entry in the `where` argument. + + As in the `asof` function, if the label (a particular entry in + `where`) is not in the index, the latest index label upto the + passed label is chosen and its index returned. + + If all of the labels in the index are later than a label in `where`, + -1 is returned. + + `mask` is used to ignore NA values in the index during calculation. Parameters ---------- - target : an iterable + where : Index + An Index consisting of an array of timestamps. + mask : array-like + Array of booleans denoting where values in the original + data are not NA. Returns ------- - new_index : pd.Index - Resulting index - indexer : np.ndarray or None - Indices of output values in original index - + numpy.ndarray + An array of locations (indices) of the labels from the Index + which correspond to the return values of the `asof` function + for every element in `where`. 
""" + locs = self.values[mask].searchsorted(where.values, side='right') + locs = np.where(locs > 0, locs - 1, 0) - target = ensure_index(target) - indexer, missing = self.get_indexer_non_unique(target) - check = indexer != -1 - new_labels = self.take(indexer[check]) - new_indexer = None + result = np.arange(len(self))[mask].take(locs) - if len(missing): - length = np.arange(len(indexer)) + first = mask.argmax() + result[(locs == 0) & (where.values < self.values[first])] = -1 - missing = ensure_platform_int(missing) - missing_labels = target.take(missing) - missing_indexer = ensure_int64(length[~check]) - cur_labels = self.take(indexer[check]).values - cur_indexer = ensure_int64(length[check]) + return result - new_labels = np.empty(tuple([len(indexer)]), dtype=object) - new_labels[cur_indexer] = cur_labels - new_labels[missing_indexer] = missing_labels + def sort_values(self, return_indexer=False, ascending=True): + """ + Return a sorted copy of the index. - # a unique indexer - if target.is_unique: + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. - # see GH5553, make sure we use the right indexer - new_indexer = np.arange(len(indexer)) - new_indexer[cur_indexer] = np.arange(len(cur_labels)) - new_indexer[missing_indexer] = -1 + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. - # we have a non_unique selector, need to use the original - # indexer here - else: + Returns + ------- + sorted_index : pandas.Index + Sorted copy of the index. + indexer : numpy.ndarray, optional + The indices that the index itself was sorted by. - # need to retake to have the same size as the indexer - indexer[~check] = -1 + See Also + -------- + pandas.Series.sort_values : Sort values of a Series. + pandas.DataFrame.sort_values : Sort values in a DataFrame. - # reset the new indexer to account for the new size - new_indexer = np.arange(len(self.take(indexer))) - new_indexer[~check] = -1 + Examples + -------- + >>> idx = pd.Index([10, 100, 1, 1000]) + >>> idx + Int64Index([10, 100, 1, 1000], dtype='int64') - new_index = self._shallow_copy_with_infer(new_labels, freq=None) - return new_index, indexer, new_indexer + Sort values in ascending order (default behavior). - _index_shared_docs['join'] = """ - *this is an internal non-public method* + >>> idx.sort_values() + Int64Index([1, 10, 100, 1000], dtype='int64') - Compute join_index and indexers to conform data - structures to the new index. + Sort values in descending order, and also get the indices `idx` was + sorted by. + + >>> idx.sort_values(ascending=False, return_indexer=True) + (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) + """ + _as = self.argsort() + if not ascending: + _as = _as[::-1] + + sorted_index = self.take(_as) + + if return_indexer: + return sorted_index, _as + else: + return sorted_index + + def sort(self, *args, **kwargs): + raise TypeError("cannot sort an Index object in-place, use " + "sort_values instead") + + def shift(self, periods=1, freq=None): + """ + Shift index by desired number of time frequency increments. + + This method is for shifting the values of datetime-like indexes + by a specified time increment a given number of times. 
Parameters ---------- - other : Index - how : {'left', 'right', 'inner', 'outer'} - level : int or level name, default None - return_indexers : boolean, default False - sort : boolean, default False - Sort the join keys lexicographically in the result Index. If False, - the order of the join keys depends on the join type (how keyword) - - .. versionadded:: 0.20.0 + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. + freq : pandas.DateOffset, pandas.Timedelta or string, optional + Frequency increment to shift by. + If None, the index is shifted by its own `freq` attribute. + Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. Returns ------- - join_index, (left_indexer, right_indexer) - """ + pandas.Index + shifted index - @Appender(_index_shared_docs['join']) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - from .multi import MultiIndex - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + See Also + -------- + Series.shift : Shift values of Series. - # try to figure out the join level - # GH3662 - if level is None and (self_is_mi or other_is_mi): + Examples + -------- + Put the first 5 month starts of 2011 into an index. - # have the same levels/names so a simple join - if self.names == other.names: - pass - else: - return self._join_multi(other, how=how, - return_indexers=return_indexers) + >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') + >>> month_starts + DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', + '2011-05-01'], + dtype='datetime64[ns]', freq='MS') - # join on the level - if level is not None and (self_is_mi or other_is_mi): - return self._join_level(other, level, how=how, - return_indexers=return_indexers) + Shift the index by 10 days. - other = ensure_index(other) + >>> month_starts.shift(10, freq='D') + DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', + '2011-05-11'], + dtype='datetime64[ns]', freq=None) - if len(other) == 0 and how in ('left', 'outer'): - join_index = self._shallow_copy() - if return_indexers: - rindexer = np.repeat(-1, len(join_index)) - return join_index, None, rindexer - else: - return join_index + The default value of `freq` is the `freq` attribute of the index, + which is 'MS' (month start) in this example. - if len(self) == 0 and how in ('right', 'outer'): - join_index = other._shallow_copy() - if return_indexers: - lindexer = np.repeat(-1, len(join_index)) - return join_index, lindexer, None - else: - return join_index + >>> month_starts.shift(10) + DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', + '2012-03-01'], + dtype='datetime64[ns]', freq='MS') - if self._join_precedence < other._join_precedence: - how = {'right': 'left', 'left': 'right'}.get(how, how) - result = other.join(self, how=how, level=level, - return_indexers=return_indexers) - if return_indexers: - x, y, z = result - result = x, z, y - return result + Notes + ----- + This method is only implemented for datetime-like index classes, + i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. + """ + raise NotImplementedError("Not supported for type %s" % + type(self).__name__) - if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.join(other, how=how, return_indexers=return_indexers) + def argsort(self, *args, **kwargs): + """ + Return the integer indices that would sort the index. 
- _validate_join_method(how) + Parameters + ---------- + *args + Passed to `numpy.ndarray.argsort`. + **kwargs + Passed to `numpy.ndarray.argsort`. - if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) - elif not self.is_unique or not other.is_unique: - if self.is_monotonic and other.is_monotonic: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) - else: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) - elif self.is_monotonic and other.is_monotonic: - try: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) - except TypeError: - pass + Returns + ------- + numpy.ndarray + Integer indices that would sort the index if used as + an indexer. - if how == 'left': - join_index = self - elif how == 'right': - join_index = other - elif how == 'inner': - join_index = self.intersection(other) - elif how == 'outer': - join_index = self.union(other) + See Also + -------- + numpy.argsort : Similar method for NumPy arrays. + Index.sort_values : Return sorted copy of Index. - if sort: - join_index = join_index.sort_values() + Examples + -------- + >>> idx = pd.Index(['b', 'a', 'd', 'c']) + >>> idx + Index(['b', 'a', 'd', 'c'], dtype='object') - if return_indexers: - if join_index is self: - lindexer = None - else: - lindexer = self.get_indexer(join_index) - if join_index is other: - rindexer = None - else: - rindexer = other.get_indexer(join_index) - return join_index, lindexer, rindexer - else: - return join_index + >>> order = idx.argsort() + >>> order + array([1, 0, 3, 2]) - def _join_multi(self, other, how, return_indexers=True): - from .multi import MultiIndex - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + >>> idx[order] + Index(['a', 'b', 'c', 'd'], dtype='object') + """ + result = self.asi8 + if result is None: + result = np.array(self) + return result.argsort(*args, **kwargs) - # figure out join names - self_names = com._not_none(*self.names) - other_names = com._not_none(*other.names) - overlap = list(set(self_names) & set(other_names)) - - # need at least 1 in common, but not more than 1 - if not len(overlap): - raise ValueError("cannot join with no level specified and no " - "overlapping names") - if len(overlap) > 1: - raise NotImplementedError("merging with more than one level " - "overlap on a multi-index is not " - "implemented") - jl = overlap[0] + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing. + """ - # make the indices into mi's that match - if not (self_is_mi and other_is_mi): - - flip_order = False - if self_is_mi: - self, other = other, self - flip_order = True - # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) - - level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) - - if flip_order: - if isinstance(result, tuple): - return result[0], result[2], result[1] - return result + # if we have something that is Index-like, then + # use this, e.g. 
DatetimeIndex + s = getattr(series, '_values', None) + if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): + # GH 20882, 21257 + # Unify Index and ExtensionArray treatment + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + try: + iloc = self.get_loc(key) + return s[iloc] + except KeyError: + if (len(self) > 0 and + (self.holds_integer() or self.is_boolean())): + raise + elif is_integer(key): + return s[key] - # 2 multi-indexes - raise NotImplementedError("merging with both multi-indexes is not " - "implemented") + s = com.values_from_object(series) + k = com.values_from_object(key) - def _join_non_unique(self, other, how='left', return_indexers=False): - from pandas.core.reshape.merge import _get_join_indexers + k = self._convert_scalar_indexer(k, kind='getitem') + try: + return self._engine.get_value(s, k, + tz=getattr(series.dtype, 'tz', None)) + except KeyError as e1: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise - left_idx, right_idx = _get_join_indexers([self._ndarray_values], - [other._ndarray_values], - how=how, - sort=True) + try: + return libindex.get_value_box(s, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + # python 3 + if is_scalar(key): # pragma: no cover + raise IndexError(key) + raise InvalidIndexError(key) - left_idx = ensure_platform_int(left_idx) - right_idx = ensure_platform_int(right_idx) + def set_value(self, arr, key, value): + """ + Fast lookup of value from 1-dimensional ndarray. - join_index = np.asarray(self._ndarray_values.take(left_idx)) - mask = left_idx == -1 - np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) + Notes + ----- + Only use this if you know what you're doing. + """ + self._engine.set_value(com.values_from_object(arr), + com.values_from_object(key), value) + + _index_shared_docs['get_indexer_non_unique'] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. + + Parameters + ---------- + target : %(target_klass)s + + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. + missing : ndarray of int + An indexer into the target of the values not found. 
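A minimal sketch of the low-level `get_value` lookup above, assuming a pandas version of roughly this patch's vintage (the method was later deprecated, so prefer regular `Series` indexing in new code):

import pandas as pd

ser = pd.Series([10.0, 20.0, 30.0], index=['a', 'b', 'c'])

# low-level scalar lookup used internally by Series indexing;
# ser['b'] is the public equivalent
print(ser.index.get_value(ser, 'b'))   # 20.0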
+ These correspond to the -1 in the indexer array + """ - join_index = self._wrap_joined_index(join_index, other) + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + if is_categorical(target): + target = target.astype(target.dtype.categories.dtype) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) - if return_indexers: - return join_index, left_idx, right_idx + if self.is_all_dates: + self = Index(self.asi8) + tgt_values = target.asi8 else: - return join_index + tgt_values = target._ndarray_values - def _join_level(self, other, level, how='left', return_indexers=False, - keep_order=True): + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), missing + + def get_indexer_for(self, target, **kwargs): """ - The join method *only* affects the level of the resulting - MultiIndex. Otherwise it just exactly aligns the Index data to the - labels of the level in the MultiIndex. If `keep_order` == True, the - order of the data indexed by the MultiIndex will not be changed; - otherwise, it will tie out with `other`. + Guaranteed return of an indexer even when non-unique. + + This dispatches to get_indexer or get_indexer_nonunique + as appropriate. """ - from .multi import MultiIndex + if self.is_unique: + return self.get_indexer(target, **kwargs) + indexer, _ = self.get_indexer_non_unique(target, **kwargs) + return indexer - def _get_leaf_sorter(labels): - """ - returns sorter for the inner most level while preserving the - order of higher levels - """ - if labels[0].size == 0: - return np.empty(0, dtype='int64') + def _maybe_promote(self, other): + # A hack, but it works + from pandas import DatetimeIndex + if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): + return DatetimeIndex(self), other + elif self.inferred_type == 'boolean': + if not is_object_dtype(self.dtype): + return self.astype('object'), other.astype('object') + return self, other - if len(labels) == 1: - lab = ensure_int64(labels[0]) - sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) - return sorter + def groupby(self, values): + """ + Group the index labels by a given array of values. - # find indexers of beginning of each set of - # same-key labels w.r.t all but last level - tic = labels[0][:-1] != labels[0][1:] - for lab in labels[1:-1]: - tic |= lab[:-1] != lab[1:] + Parameters + ---------- + values : array + Values used to determine the groups. 
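A hedged example of `get_indexer_non_unique` above with a duplicated index (sample labels invented):

import pandas as pd

idx = pd.Index(['a', 'b', 'b', 'c'])

indexer, missing = idx.get_indexer_non_unique(['b', 'x'])
# duplicate matches are expanded and unmatched targets become -1
print(indexer)   # positions 1 and 2 for 'b', then -1 for 'x'
# `missing` holds the positions *within the target* that were not found
print(missing)   # position 1, i.e. 'x'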
- starts = np.hstack(([True], tic, [True])).nonzero()[0] - lab = ensure_int64(labels[-1]) - return lib.get_level_sorter(lab, ensure_int64(starts)) + Returns + ------- + groups : dict + {group name -> group labels} + """ - if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): - raise TypeError('Join on level between two MultiIndex objects ' - 'is ambiguous') + # TODO: if we are a MultiIndex, we can do better + # that converting to tuples + from .multi import MultiIndex + if isinstance(values, MultiIndex): + values = values.values + values = ensure_categorical(values) + result = values._reverse_indexer() - left, right = self, other + # map to the label + result = {k: self.take(v) for k, v in compat.iteritems(result)} - flip_order = not isinstance(self, MultiIndex) - if flip_order: - left, right = right, left - how = {'right': 'left', 'left': 'right'}.get(how, how) + return result - level = left._get_level_number(level) - old_level = left.levels[level] + def map(self, mapper, na_action=None): + """ + Map values using input correspondence (a dict, Series, or function). - if not right.is_unique: - raise NotImplementedError('Index._join_level on non-unique index ' - 'is not implemented') + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. - new_level, left_lev_indexer, right_lev_indexer = \ - old_level.join(right, how=how, return_indexers=True) + Returns + ------- + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ - if left_lev_indexer is None: - if keep_order or len(left) == 0: - left_indexer = None - join_index = left - else: # sort the leaves - left_indexer = _get_leaf_sorter(left.labels[:level + 1]) - join_index = left[left_indexer] + from .multi import MultiIndex + new_values = super(Index, self)._map_values( + mapper, na_action=na_action) - else: - left_lev_indexer = ensure_int64(left_lev_indexer) - rev_indexer = lib.get_reverse_indexer(left_lev_indexer, - len(old_level)) + attributes = self._get_attributes_dict() - new_lev_labels = algos.take_nd(rev_indexer, left.labels[level], - allow_fill=False) + # we can return a MultiIndex + if new_values.size and isinstance(new_values[0], tuple): + if isinstance(self, MultiIndex): + names = self.names + elif attributes.get('name'): + names = [attributes.get('name')] * len(new_values[0]) + else: + names = None + return MultiIndex.from_tuples(new_values, + names=names) - new_labels = list(left.labels) - new_labels[level] = new_lev_labels + attributes['copy'] = False + if not new_values.size: + # empty + attributes['dtype'] = self.dtype - new_levels = list(left.levels) - new_levels[level] = new_level + return Index(new_values, **attributes) - if keep_order: # just drop missing values. o.w. keep order - left_indexer = np.arange(len(left), dtype=np.intp) - mask = new_lev_labels != -1 - if not mask.all(): - new_labels = [lab[mask] for lab in new_labels] - left_indexer = left_indexer[mask] + def isin(self, values, level=None): + """ + Return a boolean array where the index values are in `values`. 
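Small usage sketches of `groupby` and `map` above; `Index.groupby` is not part of the documented API, so this is illustrative only and the exact reprs depend on the pandas version:

import pandas as pd
import numpy as np

idx = pd.Index(['w', 'x', 'y', 'z'])

# group the index labels by an external array of group keys
print(idx.groupby(np.array(['g1', 'g2', 'g1', 'g2'])))

# map returning tuples is promoted to a MultiIndex, per the handling above
print(idx.map(lambda v: (v, v.upper())))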
- else: # tie out the order with other - if level == 0: # outer most level, take the fast route - ngroups = 1 + new_lev_labels.max() - left_indexer, counts = libalgos.groupsort_indexer( - new_lev_labels, ngroups) + Compute boolean array of whether each index value is found in the + passed set of values. The length of the returned boolean array matches + the length of the index. - # missing values are placed first; drop them! - left_indexer = left_indexer[counts[0]:] - new_labels = [lab[left_indexer] for lab in new_labels] + Parameters + ---------- + values : set or list-like + Sought values. - else: # sort the leaves - mask = new_lev_labels != -1 - mask_all = mask.all() - if not mask_all: - new_labels = [lab[mask] for lab in new_labels] + .. versionadded:: 0.18.1 - left_indexer = _get_leaf_sorter(new_labels[:level + 1]) - new_labels = [lab[left_indexer] for lab in new_labels] + Support for values as a set. - # left_indexers are w.r.t masked frame. - # reverse to original frame! - if not mask_all: - left_indexer = mask.nonzero()[0][left_indexer] + level : str or int, optional + Name or position of the index level to use (if the index is a + `MultiIndex`). - join_index = MultiIndex(levels=new_levels, labels=new_labels, - names=left.names, verify_integrity=False) + Returns + ------- + is_contained : ndarray + NumPy array of boolean values. - if right_lev_indexer is not None: - right_indexer = algos.take_nd(right_lev_indexer, - join_index.labels[level], - allow_fill=False) - else: - right_indexer = join_index.labels[level] + See Also + -------- + Series.isin : Same for Series. + DataFrame.isin : Same method for DataFrames. - if flip_order: - left_indexer, right_indexer = right_indexer, left_indexer + Notes + ----- + In the case of `MultiIndex` you must either specify `values` as a + list-like object containing tuples that are the same length as the + number of levels, or specify `level`. Otherwise it will raise a + ``ValueError``. - if return_indexers: - left_indexer = (None if left_indexer is None - else ensure_platform_int(left_indexer)) - right_indexer = (None if right_indexer is None - else ensure_platform_int(right_indexer)) - return join_index, left_indexer, right_indexer - else: - return join_index + If `level` is specified: - def _join_monotonic(self, other, how='left', return_indexers=False): - if self.equals(other): - ret_index = other if how == 'right' else self - if return_indexers: - return ret_index, None, None - else: - return ret_index + - if it is the name of one *and only one* index level, use that level; + - otherwise it should be a number indicating level position. 
- sv = self._ndarray_values - ov = other._ndarray_values + Examples + -------- + >>> idx = pd.Index([1,2,3]) + >>> idx + Int64Index([1, 2, 3], dtype='int64') - if self.is_unique and other.is_unique: - # We can perform much better than the general case - if how == 'left': - join_index = self - lidx = None - ridx = self._left_indexer_unique(sv, ov) - elif how == 'right': - join_index = other - lidx = self._left_indexer_unique(ov, sv) - ridx = None - elif how == 'inner': - join_index, lidx, ridx = self._inner_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) - elif how == 'outer': - join_index, lidx, ridx = self._outer_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) - else: - if how == 'left': - join_index, lidx, ridx = self._left_indexer(sv, ov) - elif how == 'right': - join_index, ridx, lidx = self._left_indexer(ov, sv) - elif how == 'inner': - join_index, lidx, ridx = self._inner_indexer(sv, ov) - elif how == 'outer': - join_index, lidx, ridx = self._outer_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) + Check whether each index value in a list of values. + >>> idx.isin([1, 4]) + array([ True, False, False]) + + >>> midx = pd.MultiIndex.from_arrays([[1,2,3], + ... ['red', 'blue', 'green']], + ... names=('number', 'color')) + >>> midx + MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']], + labels=[[0, 1, 2], [2, 0, 1]], + names=['number', 'color']) - if return_indexers: - lidx = None if lidx is None else ensure_platform_int(lidx) - ridx = None if ridx is None else ensure_platform_int(ridx) - return join_index, lidx, ridx - else: - return join_index + Check whether the strings in the 'color' level of the MultiIndex + are in a list of colors. - def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None - return Index(joined, name=name) + >>> midx.isin(['red', 'orange', 'yellow'], level='color') + array([ True, False, False]) + + To check across the levels of a MultiIndex, pass a list of tuples: + + >>> midx.isin([(1, 'red'), (3, 'red')]) + array([ True, False, False]) + + For a DatetimeIndex, string values in `values` are converted to + Timestamps. + + >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] + >>> dti = pd.to_datetime(dates) + >>> dti + DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], + dtype='datetime64[ns]', freq=None) + + >>> dti.isin(['2000-03-11']) + array([ True, False, False]) + """ + if level is not None: + self._validate_index_level(level) + return algos.isin(self, values) def _get_string_slice(self, key, use_lhs=True, use_rhs=True): # this is for partial string indexing, @@ -4211,8 +4603,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): def _maybe_cast_indexer(self, key): """ - If we have a float key and are not a floating index - then try to cast to an int if equivalent + If we have a float key and are not a floating index, then try to cast + to an int if equivalent. """ if is_float(key) and not self.is_floating(): @@ -4226,9 +4618,8 @@ def _maybe_cast_indexer(self, key): def _validate_indexer(self, form, key, kind): """ - if we are positional indexer - validate that we have appropriate typed bounds - must be an integer + If we are positional indexer, validate that we have appropriate + typed bounds must be an integer. 
""" assert kind in ['ix', 'loc', 'getitem', 'iloc'] @@ -4313,7 +4704,6 @@ def get_slice_bound(self, label, side, kind): label : object side : {'left', 'right'} kind : {'ix', 'loc', 'getitem'} - """ assert kind in ['ix', 'loc', 'getitem', None] @@ -4390,7 +4780,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): See Also -------- - Index.get_loc : Get location for a single label + Index.get_loc : Get location for a single label. """ inc = (step is None or step >= 0) @@ -4439,7 +4829,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): def delete(self, loc): """ - Make new Index with passed location(-s) deleted + Make new Index with passed location(-s) deleted. Returns ------- @@ -4449,8 +4839,9 @@ def delete(self, loc): def insert(self, loc, item): """ - Make new Index inserting new item at location. Follows - Python list.append semantics for negative values + Make new Index inserting new item at location. + + Follows Python list.append semantics for negative values. Parameters ---------- @@ -4468,7 +4859,7 @@ def insert(self, loc, item): def drop(self, labels, errors='raise'): """ - Make new Index with passed list of labels deleted + Make new Index with passed list of labels deleted. Parameters ---------- @@ -4496,194 +4887,19 @@ def drop(self, labels, errors='raise'): indexer = indexer[~mask] return self.delete(indexer) - _index_shared_docs['index_unique'] = ( - """ - Return unique values in the index. Uniques are returned in order - of appearance, this does NOT sort. - - Parameters - ---------- - level : int or str, optional, default None - Only return values from specified level (for MultiIndex) - - .. versionadded:: 0.23.0 - - Returns - ------- - Index without duplicates - - See Also - -------- - unique - Series.unique - """) - - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) - result = super(Index, self).unique() - return self._shallow_copy(result) - - def drop_duplicates(self, keep='first'): - """ - Return Index with duplicate values removed. - - Parameters - ---------- - keep : {'first', 'last', ``False``}, default 'first' - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - - Returns - ------- - deduplicated : Index - - See Also - -------- - Series.drop_duplicates : equivalent method on Series - DataFrame.drop_duplicates : equivalent method on DataFrame - Index.duplicated : related method on Index, indicating duplicate - Index values. - - Examples - -------- - Generate an pandas.Index with duplicate values. - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - - The `keep` parameter controls which duplicate values are removed. - The value 'first' keeps the first occurrence for each - set of duplicated entries. The default value of keep is 'first'. - - >>> idx.drop_duplicates(keep='first') - Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') - - The value 'last' keeps the last occurrence for each set of duplicated - entries. - - >>> idx.drop_duplicates(keep='last') - Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') - - The value ``False`` discards all sets of duplicated entries. 
- - >>> idx.drop_duplicates(keep=False) - Index(['cow', 'beetle', 'hippo'], dtype='object') - """ - return super(Index, self).drop_duplicates(keep=keep) - - def duplicated(self, keep='first'): - """ - Indicate duplicate index values. - - Duplicated values are indicated as ``True`` values in the resulting - array. Either all duplicates, all except the first, or all except the - last occurrence of duplicates can be indicated. - - Parameters - ---------- - keep : {'first', 'last', False}, default 'first' - The value or values in a set of duplicates to mark as missing. - - - 'first' : Mark duplicates as ``True`` except for the first - occurrence. - - 'last' : Mark duplicates as ``True`` except for the last - occurrence. - - ``False`` : Mark all duplicates as ``True``. - - Examples - -------- - By default, for each set of duplicated values, the first occurrence is - set to False and all others to True: - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) - >>> idx.duplicated() - array([False, False, True, False, True]) - - which is equivalent to - - >>> idx.duplicated(keep='first') - array([False, False, True, False, True]) - - By using 'last', the last occurrence of each set of duplicated values - is set on False and all others on True: - - >>> idx.duplicated(keep='last') - array([ True, False, True, False, False]) - - By setting keep on ``False``, all duplicates are True: - - >>> idx.duplicated(keep=False) - array([ True, False, True, False, True]) - - Returns - ------- - numpy.ndarray - - See Also - -------- - pandas.Series.duplicated : Equivalent method on pandas.Series - pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame - pandas.Index.drop_duplicates : Remove duplicate values from Index - """ - return super(Index, self).duplicated(keep=keep) - - _index_shared_docs['fillna'] = """ - Fill NA/NaN values with the specified value - - Parameters - ---------- - value : scalar - Scalar value to use to fill holes (e.g. 0). - This value cannot be a list-likes. - downcast : dict, default is None - a dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible) - - Returns - ------- - filled : %(klass)s - """ - - @Appender(_index_shared_docs['fillna']) - def fillna(self, value=None, downcast=None): - self._assert_can_do_op(value) - if self.hasnans: - result = self.putmask(self._isnan, value) - if downcast is None: - # no need to care metadata other than name - # because it can't have freq if - return Index(result, name=self.name) - return self._shallow_copy() - - _index_shared_docs['dropna'] = """ - Return Index without NA/NaN values - - Parameters - ---------- - how : {'any', 'all'}, default 'any' - If the Index is a MultiIndex, drop the value when any or all levels - are NaN. 
- - Returns - ------- - valid : Index - """ - - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): - if how not in ('any', 'all'): - raise ValueError("invalid how option: {0}".format(how)) - - if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) - return self._shallow_copy() + # -------------------------------------------------------------------- + # Generated Arithmetic, Comparison, and Unary Methods def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that # operation and then wrap the results + if self._is_numeric_dtype and op.__name__ in ['add', 'sub', + 'radd', 'rsub']: + raise TypeError("Operation {opname} between {cls} and {other} " + "is invalid".format(opname=op.__name__, + cls=self.dtype, + other=type(other).__name__)) + other = Timedelta(other) values = self.values @@ -4701,7 +4917,9 @@ def _evaluate_with_datetime_like(self, other, op): @classmethod def _add_comparison_methods(cls): - """ add in comparison methods """ + """ + Add in comparison methods. + """ cls.__eq__ = _make_comparison_op(operator.eq, cls) cls.__ne__ = _make_comparison_op(operator.ne, cls) cls.__lt__ = _make_comparison_op(operator.lt, cls) @@ -4711,7 +4929,9 @@ def _add_comparison_methods(cls): @classmethod def _add_numeric_methods_add_sub_disabled(cls): - """ add in the numeric add/sub methods to disable """ + """ + Add in the numeric add/sub methods to disable. + """ cls.__add__ = make_invalid_op('__add__') cls.__radd__ = make_invalid_op('__radd__') cls.__iadd__ = make_invalid_op('__iadd__') @@ -4721,7 +4941,9 @@ def _add_numeric_methods_add_sub_disabled(cls): @classmethod def _add_numeric_methods_disabled(cls): - """ add in numeric methods to disable other than add/sub """ + """ + Add in numeric methods to disable other than add/sub. + """ cls.__pow__ = make_invalid_op('__pow__') cls.__rpow__ = make_invalid_op('__rpow__') cls.__mul__ = make_invalid_op('__mul__') @@ -4741,12 +4963,15 @@ def _add_numeric_methods_disabled(cls): cls.__inv__ = make_invalid_op('__inv__') def _maybe_update_attributes(self, attrs): - """ Update Index attributes (e.g. freq) depending on op """ + """ + Update Index attributes (e.g. freq) depending on op. + """ return attrs def _validate_for_numeric_unaryop(self, op, opstr): - """ validate if we can perform a numeric unary operation """ - + """ + Validate if we can perform a numeric unary operation. + """ if not self._is_numeric_dtype: raise TypeError("cannot evaluate a numeric op " "{opstr} for type: {typ}" @@ -4754,10 +4979,12 @@ def _validate_for_numeric_unaryop(self, op, opstr): def _validate_for_numeric_binop(self, other, op): """ - return valid other, evaluate or raise TypeError - if we are not of the appropriate type + Return valid other; evaluate or raise TypeError if we are not of + the appropriate type. - internal method called by ops + Notes + ----- + This is an internal method called by ops. """ opstr = '__{opname}__'.format(opname=op.__name__) # if we are an inheritor of numeric, @@ -4797,30 +5024,35 @@ def _validate_for_numeric_binop(self, other, op): @classmethod def _add_numeric_methods_binary(cls): - """ add in numeric methods """ + """ + Add in numeric methods. 
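# Hedged sketch (not part of the patch) of the guard added to
# _evaluate_with_timedelta_like above: add/sub between a numeric index and a
# Timedelta now raises instead of producing a meaningless result.
import pandas as pd

idx = pd.Index([1, 2, 3])
try:
    idx + pd.Timedelta(days=1)
except TypeError as exc:
    print(exc)   # roughly: "Operation add between int64 and Timedelta is invalid"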
+ """ cls.__add__ = _make_arithmetic_op(operator.add, cls) cls.__radd__ = _make_arithmetic_op(ops.radd, cls) cls.__sub__ = _make_arithmetic_op(operator.sub, cls) cls.__rsub__ = _make_arithmetic_op(ops.rsub, cls) - cls.__mul__ = _make_arithmetic_op(operator.mul, cls) - cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) cls.__rpow__ = _make_arithmetic_op(ops.rpow, cls) cls.__pow__ = _make_arithmetic_op(operator.pow, cls) - cls.__mod__ = _make_arithmetic_op(operator.mod, cls) - cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) - cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) + cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls) cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls) if not compat.PY3: cls.__div__ = _make_arithmetic_op(operator.div, cls) cls.__rdiv__ = _make_arithmetic_op(ops.rdiv, cls) + # TODO: rmod? rdivmod? + cls.__mod__ = _make_arithmetic_op(operator.mod, cls) + cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) + cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) cls.__divmod__ = _make_arithmetic_op(divmod, cls) + cls.__mul__ = _make_arithmetic_op(operator.mul, cls) + cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) @classmethod def _add_numeric_methods_unary(cls): - """ add in numeric unary methods """ - + """ + Add in numeric unary methods. + """ def _make_evaluate_unary(op, opstr): def _evaluate_numeric_unary(self): @@ -4844,8 +5076,9 @@ def _add_numeric_methods(cls): @classmethod def _add_logical_methods(cls): - """ add in logical methods """ - + """ + Add in logical methods. + """ _doc = """ %(desc)s @@ -4949,7 +5182,9 @@ def logical_func(self, *args, **kwargs): @classmethod def _add_logical_methods_disabled(cls): - """ add in logical methods to disable """ + """ + Add in logical methods to disable. + """ cls.all = make_invalid_op('all') cls.any = make_invalid_op('any') @@ -4960,7 +5195,8 @@ def _add_logical_methods_disabled(cls): def ensure_index_from_sequences(sequences, names=None): - """Construct an index from sequences of data. + """ + Construct an index from sequences of data. A single sequence returns an Index. Many sequences returns a MultiIndex. @@ -5001,7 +5237,7 @@ def ensure_index_from_sequences(sequences, names=None): def ensure_index(index_like, copy=False): """ - Ensure that we have an index from some index-like object + Ensure that we have an index from some index-like object. Parameters ---------- @@ -5063,7 +5299,9 @@ def ensure_index(index_like, copy=False): def _ensure_has_len(seq): - """If seq is an iterator, put its values into a list.""" + """ + If seq is an iterator, put its values into a list. + """ try: len(seq) except TypeError: @@ -5074,7 +5312,7 @@ def _ensure_has_len(seq): def _trim_front(strings): """ - Trims zeros and decimal points + Trims zeros and decimal points. 
""" trimmed = strings while len(strings) > 0 and all(x[0] == ' ' for x in trimmed): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 45703c220a4be..6d26894514a9c 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,30 +1,29 @@ import operator +import warnings import numpy as np -from pandas._libs import index as libindex -from pandas import compat +from pandas._libs import index as libindex +import pandas.compat as compat from pandas.compat.numpy import function as nv -from pandas.core.dtypes.generic import ABCCategorical, ABCSeries -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.util._decorators import Appender, cache_readonly + from pandas.core.dtypes.common import ( - is_categorical_dtype, - ensure_platform_int, - is_list_like, - is_interval_dtype, + ensure_platform_int, is_categorical_dtype, is_interval_dtype, is_list_like, is_scalar) -from pandas.core.dtypes.missing import array_equivalent, isna -from pandas.core.algorithms import take_1d - +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.generic import ABCCategorical, ABCSeries +from pandas.core.dtypes.missing import isna -from pandas.util._decorators import Appender, cache_readonly -from pandas.core.config import get_option -from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core import accessor +from pandas.core.algorithms import take_1d +from pandas.core.arrays.categorical import Categorical, contains import pandas.core.common as com -import pandas.core.missing as missing +from pandas.core.config import get_option import pandas.core.indexes.base as ibase -from pandas.core.arrays.categorical import Categorical, contains +from pandas.core.indexes.base import Index, _index_shared_docs +import pandas.core.missing as missing +from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) @@ -43,7 +42,6 @@ typ='method', overwrite=True) class CategoricalIndex(Index, accessor.PandasDelegate): """ - Immutable Index implementing an ordered, sliceable set. CategoricalIndex represents a sparsely populated Index with an underlying Categorical. @@ -83,14 +81,31 @@ class CategoricalIndex(Index, accessor.PandasDelegate): """ _typ = 'categoricalindex' - _engine_type = libindex.Int64Engine + + @property + def _engine_type(self): + # self.codes can have dtype int8, int16, int32 or int64, so we need + # to return the corresponding engine type (libindex.Int8Engine, etc.). 
+ return {np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine, + }[self.codes.dtype.type] + _attributes = ['name'] + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, categories=None, ordered=None, dtype=None, - copy=False, name=None, fastpath=False): + copy=False, name=None, fastpath=None): - if fastpath: - return cls._simple_new(data, name=name, dtype=dtype) + if fastpath is not None: + warnings.warn("The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, stacklevel=2) + if fastpath: + return cls._simple_new(data, name=name, dtype=dtype) if name is None and hasattr(data, 'name'): name = data.name @@ -200,6 +215,8 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, result._reset_identity() return result + # -------------------------------------------------------------------- + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, categories=None, ordered=None, dtype=None, **kwargs): @@ -266,12 +283,17 @@ def equals(self, other): try: other = self._is_dtype_compat(other) - return array_equivalent(self._data, other) + if isinstance(other, type(self)): + other = other._data + return self._data.equals(other) except (TypeError, ValueError): pass return False + # -------------------------------------------------------------------- + # Rendering Methods + @property def _formatter_func(self): return self.categories._formatter_func @@ -295,6 +317,8 @@ def _format_attrs(self): attrs.append(('length', len(self))) return attrs + # -------------------------------------------------------------------- + @property def inferred_type(self): return 'categorical' @@ -309,6 +333,10 @@ def itemsize(self): # Size of the items in categories, not codes. 
return self.values.itemsize + def _wrap_setop_result(self, other, result): + name = get_op_result_name(self, other) + return self._shallow_copy(result, name=name) + def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() @@ -377,7 +405,7 @@ def argsort(self, *args, **kwargs): def _engine(self): # we are going to look things up with the codes themselves - return self._engine_type(lambda: self.codes.astype('i8'), len(self)) + return self._engine_type(lambda: self.codes, len(self)) # introspection @cache_readonly @@ -426,6 +454,10 @@ def get_loc(self, key, method=None): ------- loc : int if unique index, slice if monotonic index, else mask + Raises + ------ + KeyError : if the key is not in the index + Examples --------- >>> unique_index = pd.CategoricalIndex(list('abc')) @@ -440,10 +472,12 @@ def get_loc(self, key, method=None): >>> non_monotonic_index.get_loc('b') array([False, True, False, True], dtype=bool) """ - codes = self.categories.get_loc(key) - if (codes == -1): + code = self.categories.get_loc(key) + code = self.codes.dtype.type(code) + try: + return self._engine.get_loc(code) + except KeyError: raise KeyError(key) - return self._engine.get_loc(codes) def get_value(self, series, key): """ @@ -500,12 +534,16 @@ def reindex(self, target, method=None, level=None, limit=None, target = ibase.ensure_index(target) - if not is_categorical_dtype(target) and not target.is_unique: - raise ValueError("cannot reindex with a non-unique indexer") + if self.equals(target): + indexer = None + missing = [] + else: + if not target.is_unique: + raise ValueError("cannot reindex with a non-unique indexer") - indexer, missing = self.get_indexer_non_unique(np.array(target)) + indexer, missing = self.get_indexer_non_unique(np.array(target)) - if len(self.codes): + if len(self.codes) and indexer is not None: new_target = self.take(indexer) else: new_target = target diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 1ec30ecbb3a3b..52127811b584a 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,215 +2,45 @@ """ Base and utility classes for tseries type pandas objects. 
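# Hedged sketch (not part of the patch), referring back to the CategoricalIndex
# engine changes above: the codes dtype grows with the number of categories,
# which is why the engine type is now chosen from the codes dtype at runtime
# rather than hard-coded to Int64Engine.
import pandas as pd

pd.CategoricalIndex(list("abc")).codes.dtype    # int8  -> libindex.Int8Engine
pd.CategoricalIndex(range(1000)).codes.dtype    # int16 -> libindex.Int16Engine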
""" +import operator import warnings -from pandas import compat -from pandas.compat.numpy import function as nv -from pandas.core.tools.timedeltas import to_timedelta - import numpy as np -from pandas._libs import lib, iNaT, NaT -from pandas._libs.tslibs.timestamps import round_nsint64, RoundTo +from pandas._libs import NaT, iNaT, lib +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - ensure_int64, - is_dtype_equal, - is_float, - is_integer, - is_list_like, - is_scalar, - is_bool_dtype, - is_period_dtype, - is_categorical_dtype, - is_datetime_or_timedelta_dtype, - is_float_dtype, - is_integer_dtype, - is_object_dtype, - is_string_dtype) -from pandas.core.dtypes.generic import ( - ABCIndex, ABCSeries, ABCIndexClass) -from pandas.core.dtypes.missing import isna -from pandas.core import common as com, algorithms, ops + ensure_int64, is_bool_dtype, is_categorical_dtype, + is_datetime_or_timedelta_dtype, is_dtype_equal, is_float, is_float_dtype, + is_integer, is_integer_dtype, is_list_like, is_object_dtype, + is_period_dtype, is_scalar, is_string_dtype) +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries + +from pandas.core import algorithms, ops +from pandas.core.accessor import PandasDelegate +from pandas.core.arrays.datetimelike import ( + DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8) +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.tools.timedeltas import to_timedelta import pandas.io.formats.printing as printing -from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -from pandas.core.indexes.base import Index, _index_shared_docs -from pandas.util._decorators import Appender, cache_readonly -import pandas.core.dtypes.concat as _concat - -import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) -class DatelikeOps(object): - """ common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex """ - - def strftime(self, date_format): - return Index(self.format(date_format=date_format), - dtype=compat.text_type) - strftime.__doc__ = """ - Convert to Index using specified date_format. - - Return an Index of formatted strings specified by date_format, which - supports the same string format as the python standard library. Details - of the string format can be found in `python string format doc <{0}>`__ - - Parameters - ---------- - date_format : str - Date format string (e.g. "%Y-%m-%d"). - - Returns - ------- - Index - Index of formatted strings - - See Also - -------- - pandas.to_datetime : Convert the given argument to datetime - DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. - DatetimeIndex.round : Round the DatetimeIndex to the specified freq. - DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. - - Examples - -------- - >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), - ... periods=3, freq='s') - >>> rng.strftime('%B %d, %Y, %r') - Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', - 'March 10, 2018, 09:00:02 AM'], - dtype='object') - """.format("https://docs.python.org/3/library/datetime.html" - "#strftime-and-strptime-behavior") - - -class TimelikeOps(object): - """ common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex """ - - _round_doc = ( - """ - {op} the data to the specified `freq`. 
- - Parameters - ---------- - freq : str or Offset - The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See - :ref:`frequency aliases ` for - a list of possible `freq` values. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False designates - a non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times - Only relevant for DatetimeIndex - - .. versionadded:: 0.24.0 - - Returns - ------- - DatetimeIndex, TimedeltaIndex, or Series - Index of the same type for a DatetimeIndex or TimedeltaIndex, - or a Series with the same index for a Series. - - Raises - ------ - ValueError if the `freq` cannot be converted. - - Examples - -------- - **DatetimeIndex** - - >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> rng - DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', - '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq='T') - """) - - _round_example = ( - """>>> rng.round('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.round("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """) - - _floor_example = ( - """>>> rng.floor('H') - DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.floor("H") - 0 2018-01-01 11:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """ - ) - - _ceil_example = ( - """>>> rng.ceil('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 13:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.ceil("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 13:00:00 - dtype: datetime64[ns] - """ - ) - - def _round(self, freq, mode, ambiguous): - # round the local times - values = _ensure_datetimelike_to_i8(self) - result = round_nsint64(values, mode, freq) - result = self._maybe_mask_results(result, fill_value=NaT) - - attribs = self._get_attributes_dict() - if 'freq' in attribs: - attribs['freq'] = None - if 'tz' in attribs: - attribs['tz'] = None - return self._ensure_localized( - self._shallow_copy(result, **attribs), ambiguous - ) - - @Appender((_round_doc + _round_example).format(op="round")) - def round(self, freq, ambiguous='raise'): - return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous) - - @Appender((_round_doc + _floor_example).format(op="floor")) - def floor(self, freq, ambiguous='raise'): - return self._round(freq, RoundTo.MINUS_INFTY, ambiguous) - - @Appender((_round_doc + _ceil_example).format(op="ceil")) - def ceil(self, freq, ambiguous='raise'): - return self._round(freq, RoundTo.PLUS_INFTY, ambiguous) - - class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin): - """ common ops mixin to support a unified interface datetimelike Index """ + """ + common ops mixin to support a unified interface datetimelike Index + """ + + # override DatetimeLikeArrayMixin method + copy = Index.copy + unique = Index.unique + take = Index.take # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # 
properties there. They can be made into cache_readonly for Index @@ -250,7 +80,9 @@ def equals(self, other): @staticmethod def _join_i8_wrapper(joinf, dtype, with_indexers=True): - """ create the join wrapper methods """ + """ + Create the join wrapper methods. + """ @staticmethod def wrapper(left, right): @@ -277,9 +109,10 @@ def _evaluate_compare(self, other, op): except TypeError: return result - def _ensure_localized(self, arg, ambiguous='raise', from_utc=False): + def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', + from_utc=False): """ - ensure that we are re-localized + Ensure that we are re-localized. This is for compat as we can then call this on all datetimelike indexes generally (ignored for Period/Timedelta) @@ -288,6 +121,7 @@ def _ensure_localized(self, arg, ambiguous='raise', from_utc=False): ---------- arg : DatetimeIndex / i8 ndarray ambiguous : str, bool, or bool-ndarray, default 'raise' + nonexistent : str, default 'raise' from_utc : bool, default False If True, localize the i8 ndarray to UTC first before converting to the appropriate tz. If False, localize directly to the tz. @@ -304,19 +138,18 @@ def _ensure_localized(self, arg, ambiguous='raise', from_utc=False): if from_utc: arg = arg.tz_localize('UTC').tz_convert(self.tz) else: - arg = arg.tz_localize(self.tz, ambiguous=ambiguous) + arg = arg.tz_localize( + self.tz, ambiguous=ambiguous, nonexistent=nonexistent + ) return arg def _box_values_as_index(self): """ - return object Index which contains boxed values + Return object Index which contains boxed values. """ from pandas.core.index import Index return Index(self._box_values(self.asi8), name=self.name, dtype=object) - def _format_with_header(self, header, **kwargs): - return header + list(self._format_native_types(**kwargs)) - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): try: @@ -346,7 +179,7 @@ def map(self, f): def sort_values(self, return_indexer=False, ascending=True): """ - Return sorted copy of Index + Return sorted copy of Index. """ if return_indexer: _as = self.argsort() @@ -369,6 +202,9 @@ def sort_values(self, return_indexer=False, ascending=True): if not ascending: sorted_values = sorted_values[::-1] + sorted_values = self._maybe_box_as_values(sorted_values, + **attribs) + return self._simple_new(sorted_values, **attribs) @Appender(_index_shared_docs['take'] % _index_doc_kwargs) @@ -397,7 +233,8 @@ def take(self, indices, axis=0, allow_fill=True, @property def asobject(self): - """Return object Index which contains boxed values. + """ + Return object Index which contains boxed values. .. deprecated:: 0.23.0 Use ``astype(object)`` instead. @@ -417,7 +254,7 @@ def _convert_tolerance(self, tolerance, target): def tolist(self): """ - return a list of the underlying data + Return a list of the underlying data. """ return list(self.astype(object)) @@ -426,11 +263,12 @@ def min(self, axis=None, *args, **kwargs): Return the minimum value of the Index or minimum along an axis. - See also + See Also -------- numpy.ndarray.min """ nv.validate_min(args, kwargs) + nv.validate_minmax_axis(axis) try: i8 = self.asi8 @@ -451,14 +289,16 @@ def min(self, axis=None, *args, **kwargs): def argmin(self, axis=None, *args, **kwargs): """ Returns the indices of the minimum values along an axis. + See `numpy.ndarray.argmin` for more information on the `axis` parameter. 
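# Hedged illustration (not part of the patch) of the min/argmin behaviour
# documented above for datetime-like indexes: NaT entries are masked out
# before the reduction.
import pandas as pd

dti = pd.to_datetime(["2000-03-12", None, "2000-03-11"])
dti.min()      # Timestamp('2000-03-11 00:00:00')
dti.argmin()   # 2 -- position of the earliest non-NaT value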
- See also + See Also -------- numpy.ndarray.argmin """ nv.validate_argmin(args, kwargs) + nv.validate_minmax_axis(axis) i8 = self.asi8 if self.hasnans: @@ -474,11 +314,12 @@ def max(self, axis=None, *args, **kwargs): Return the maximum value of the Index or maximum along an axis. - See also + See Also -------- numpy.ndarray.max """ nv.validate_max(args, kwargs) + nv.validate_minmax_axis(axis) try: i8 = self.asi8 @@ -499,14 +340,16 @@ def max(self, axis=None, *args, **kwargs): def argmax(self, axis=None, *args, **kwargs): """ Returns the indices of the maximum values along an axis. + See `numpy.ndarray.argmax` for more information on the `axis` parameter. - See also + See Also -------- numpy.ndarray.argmax """ nv.validate_argmax(args, kwargs) + nv.validate_minmax_axis(axis) i8 = self.asi8 if self.hasnans: @@ -517,13 +360,19 @@ def argmax(self, axis=None, *args, **kwargs): i8[mask] = 0 return i8.argmax() + # -------------------------------------------------------------------- + # Rendering Methods + + def _format_with_header(self, header, **kwargs): + return header + list(self._format_native_types(**kwargs)) + @property def _formatter_func(self): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _format_attrs(self): """ - Return a list of tuples of the (attr,formatted_value) + Return a list of tuples of the (attr,formatted_value). """ attrs = super(DatetimeIndexOpsMixin, self)._format_attrs() for attrib in self._attributes: @@ -534,10 +383,12 @@ def _format_attrs(self): attrs.append(('freq', freq)) return attrs + # -------------------------------------------------------------------- + def _convert_scalar_indexer(self, key, kind=None): """ - we don't allow integer or float indexing on datetime-like when using - loc + We don't allow integer or float indexing on datetime-like when using + loc. Parameters ---------- @@ -563,8 +414,8 @@ def _convert_scalar_indexer(self, key, kind=None): @classmethod def _add_datetimelike_methods(cls): """ - add in the datetimelike methods (as we may have to override the - superclass) + Add in the datetimelike methods (as we may have to override the + superclass). """ def __add__(self, other): @@ -595,7 +446,7 @@ def __rsub__(self, other): def isin(self, values): """ Compute boolean array of whether each index value is found in the - passed set of values + passed set of values. Parameters ---------- @@ -615,7 +466,7 @@ def isin(self, values): def repeat(self, repeats, *args, **kwargs): """ - Analogous to ndarray.repeat + Analogous to ndarray.repeat. """ nv.validate_repeat(args, kwargs) if is_period_dtype(self): @@ -632,12 +483,11 @@ def where(self, cond, other=None): result = np.where(cond, values, other).astype('i8') result = self._ensure_localized(result, from_utc=True) - return self._shallow_copy(result, - **self._get_attributes_dict()) + return self._shallow_copy(result) def _summary(self, name=None): """ - Return a summarized representation + Return a summarized representation. Parameters ---------- @@ -668,27 +518,44 @@ def _summary(self, name=None): def _concat_same_dtype(self, to_concat, name): """ - Concatenate to_concat which has the same class + Concatenate to_concat which has the same class. 
""" attribs = self._get_attributes_dict() attribs['name'] = name + # do not pass tz to set because tzlocal cannot be hashed + if len({str(x.dtype) for x in to_concat}) != 1: + raise ValueError('to_concat must have the same tz') if not is_period_dtype(self): # reset freq attribs['freq'] = None - - if getattr(self, 'tz', None) is not None: - return _concat._concat_datetimetz(to_concat, name) + # TODO(DatetimeArray) + # - remove the .asi8 here + # - remove the _maybe_box_as_values + # - combine with the `else` block + new_data = self._concat_same_type(to_concat).asi8 else: - new_data = np.concatenate([c.asi8 for c in to_concat]) + new_data = type(self._values)._concat_same_type(to_concat) + return self._simple_new(new_data, **attribs) + def _maybe_box_as_values(self, values, **attribs): + # TODO(DatetimeArray): remove + # This is a temporary shim while PeriodArray is an ExtensoinArray, + # but others are not. When everyone is an ExtensionArray, this can + # be removed. Currently used in + # - sort_values + return values + def astype(self, dtype, copy=True): if is_object_dtype(dtype): return self._box_values_as_index() elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): return Index(self.format(), name=self.name, dtype=object) elif is_integer_dtype(dtype): + # TODO(DatetimeArray): use self._values here. + # Can't use ._values currently, because that returns a + # DatetimeIndex, which throws us in an infinite loop. return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') elif (is_datetime_or_timedelta_dtype(dtype) and @@ -699,44 +566,23 @@ def astype(self, dtype, copy=True): raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy) - -def _ensure_datetimelike_to_i8(other, to_utc=False): - """ - helper for coercing an input scalar or array to i8 - - Parameters - ---------- - other : 1d array - to_utc : bool, default False - If True, convert the values to UTC before extracting the i8 values - If False, extract the i8 values directly. - - Returns - ------- - i8 1d array - """ - if is_scalar(other) and isna(other): - return iNaT - elif isinstance(other, ABCIndexClass): - # convert tz if needed - if getattr(other, 'tz', None) is not None: - if to_utc: - other = other.tz_convert('UTC') - else: - other = other.tz_localize(None) - else: - try: - return np.array(other, copy=False).view('i8') - except TypeError: - # period array cannot be coerces to int - other = Index(other) - return other.asi8 + @Appender(DatetimeLikeArrayMixin._time_shift.__doc__) + def _time_shift(self, periods, freq=None): + result = DatetimeLikeArrayMixin._time_shift(self, periods, freq=freq) + result.name = self.name + return result def wrap_arithmetic_op(self, other, result): if result is NotImplemented: return NotImplemented + if isinstance(result, tuple): + # divmod, rdivmod + assert len(result) == 2 + return (wrap_arithmetic_op(self, other, result[0]), + wrap_arithmetic_op(self, other, result[1])) + if not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) @@ -744,3 +590,105 @@ def wrap_arithmetic_op(self, other, result): res_name = ops.get_op_result_name(self, other) result.name = res_name return result + + +def wrap_array_method(method, pin_name=False): + """ + Wrap a DatetimeArray/TimedeltaArray/PeriodArray method so that the + returned object is an Index subclass instead of ndarray or ExtensionArray + subclass. 
+ + Parameters + ---------- + method : method of Datetime/Timedelta/Period Array class + pin_name : bool + Whether to set name=self.name on the output Index + + Returns + ------- + method + """ + def index_method(self, *args, **kwargs): + result = method(self, *args, **kwargs) + + # Index.__new__ will choose the appropriate subclass to return + result = Index(result) + if pin_name: + result.name = self.name + return result + + index_method.__name__ = method.__name__ + index_method.__doc__ = method.__doc__ + return index_method + + +def wrap_field_accessor(prop): + """ + Wrap a DatetimeArray/TimedeltaArray/PeriodArray array-returning property + to return an Index subclass instead of ndarray or ExtensionArray subclass. + + Parameters + ---------- + prop : property + + Returns + ------- + new_prop : property + """ + fget = prop.fget + + def f(self): + result = fget(self) + if is_bool_dtype(result): + # return numpy array b/c there is no BoolIndex + return result + return Index(result, name=self.name) + + f.__name__ = fget.__name__ + f.__doc__ = fget.__doc__ + return property(f) + + +class DatetimelikeDelegateMixin(PandasDelegate): + """ + Delegation mechanism, specific for Datetime, Timedelta, and Period types. + + Functionality is delegated from the Index class to an Array class. A + few things can be customized + + * _delegate_class : type + The class being delegated to. + * _delegated_methods, delegated_properties : List + The list of property / method names being delagated. + * raw_methods : Set + The set of methods whose results should should *not* be + boxed in an index, after being returned from the array + * raw_properties : Set + The set of properties whose results should should *not* be + boxed in an index, after being returned from the array + """ + # raw_methods : dispatch methods that shouldn't be boxed in an Index + _raw_methods = set() + # raw_properties : dispatch properties that shouldn't be boxed in an Index + _raw_properties = set() + name = None + _data = None + + @property + def _delegate_class(self): + raise AbstractMethodError + + def _delegate_property_get(self, name, *args, **kwargs): + result = getattr(self._data, name) + if name not in self._raw_properties: + result = Index(result, name=self.name) + return result + + def _delegate_property_set(self, name, value, *args, **kwargs): + setattr(self._data, name, value) + + def _delegate_method(self, name, *args, **kwargs): + result = operator.methodcaller(name, *args, **kwargs)(self._data) + if name not in self._raw_methods: + result = Index(result, name=self.name) + return result diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index a6cdaa0c2163a..fd4a1527c07b7 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,100 +1,39 @@ # pylint: disable=E1101 from __future__ import division + +from datetime import datetime, time, timedelta import operator import warnings -from datetime import time, datetime, timedelta import numpy as np -from pytz import utc -from pandas.core.base import _shared_docs +from pandas._libs import ( + Timestamp, index as libindex, join as libjoin, lib, tslib as libts) +from pandas._libs.tslibs import ccalendar, fields, parsing, timezones +import pandas.compat as compat +from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - _INT64_DTYPE, - _NS_DTYPE, - is_datetime64_dtype, - is_datetimetz, - is_dtype_equal, - is_integer, - is_float, - is_integer_dtype, - 
is_datetime64_ns_dtype, - is_period_dtype, - is_bool_dtype, - is_string_like, - is_list_like, - is_scalar, - pandas_dtype, - ensure_int64) -from pandas.core.dtypes.generic import ABCSeries -from pandas.core.dtypes.missing import isna - + _NS_DTYPE, ensure_int64, is_datetime64_ns_dtype, is_dtype_equal, is_float, + is_integer, is_list_like, is_period_dtype, is_scalar, is_string_like, + pandas_dtype) import pandas.core.dtypes.concat as _concat -from pandas.core.arrays.datetimes import DatetimeArrayMixin, _to_m8 -from pandas.core.arrays import datetimelike as dtl +from pandas.core.dtypes.missing import isna +from pandas.core.arrays.datetimes import ( + DatetimeArrayMixin as DatetimeArray, _to_m8) +from pandas.core.base import _shared_docs +import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs -from pandas.core.indexes.numeric import Int64Index, Float64Index -import pandas.compat as compat -from pandas.tseries.frequencies import to_offset, get_period_alias, Resolution from pandas.core.indexes.datetimelike import ( - DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin) -from pandas.tseries.offsets import ( - generate_range, CDay, prefix_mapping) - -from pandas.core.tools.timedeltas import to_timedelta -from pandas.util._decorators import Appender, cache_readonly, Substitution -import pandas.core.common as com -import pandas.tseries.offsets as offsets + DatetimeIndexOpsMixin, wrap_array_method, wrap_field_accessor) +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools -from pandas._libs import (lib, index as libindex, tslib as libts, - join as libjoin, Timestamp) -from pandas._libs.tslibs import (timezones, conversion, fields, parsing, - ccalendar) - -# -------- some conversion wrapper functions - - -def _wrap_field_accessor(name): - fget = getattr(DatetimeArrayMixin, name).fget - - def f(self): - result = fget(self) - if is_bool_dtype(result): - return result - return Index(result, name=self.name) - - f.__name__ = name - f.__doc__ = fget.__doc__ - return property(f) - - -def _wrap_in_index(name): - meth = getattr(DatetimeArrayMixin, name) - - def func(self, *args, **kwargs): - result = meth(self, *args, **kwargs) - return Index(result, name=self.name) - - func.__doc__ = meth.__doc__ - func.__name__ = name - return func - - -def _dt_index_cmp(cls, op): - """ - Wrap comparison operations to convert datetime-like to datetime64 - """ - opname = '__{name}__'.format(name=op.__name__) - - def wrapper(self, other): - result = getattr(DatetimeArrayMixin, opname)(self, other) - if is_bool_dtype(result): - return result - return Index(result) - - return compat.set_function_name(wrapper, opname, cls) +from pandas.tseries import offsets +from pandas.tseries.frequencies import Resolution, to_offset +from pandas.tseries.offsets import CDay, prefix_mapping def _new_DatetimeIndex(cls, d): @@ -105,14 +44,24 @@ def _new_DatetimeIndex(cls, d): # so need to localize tz = d.pop('tz', None) - result = cls.__new__(cls, verify_integrity=False, **d) + if "data" in d and not isinstance(d["data"], DatetimeIndex): + # Avoid need to verify integrity by calling simple_new directly + data = d.pop("data") + result = cls._simple_new(data, **d) + else: + with warnings.catch_warnings(): + # we ignore warnings from passing verify_integrity=False + # TODO: If we knew what was going in to **d, we might be able to + # go through _simple_new instead + warnings.simplefilter("ignore") + result = cls.__new__(cls, 
verify_integrity=False, **d) + if tz is not None: result = result.tz_localize('UTC').tz_convert(tz) return result -class DatetimeIndex(DatetimeArrayMixin, DatelikeOps, TimelikeOps, - DatetimeIndexOpsMixin, Int64Index): +class DatetimeIndex(DatetimeArray, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray of datetime64 data, represented internally as int64, and which can be boxed to Timestamp objects that are subclasses of datetime and @@ -143,6 +92,12 @@ class DatetimeIndex(DatetimeArrayMixin, DatelikeOps, TimelikeOps, the 'left', 'right', or both sides (None) tz : pytz.timezone or dateutil.tz.tzfile ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from 03:00 + DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC + and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter + dictates how ambiguous times should be handled. + - 'infer' will attempt to infer fall dst-transition hours based on order - bool-ndarray where True signifies a DST time, False signifies a @@ -213,13 +168,11 @@ class DatetimeIndex(DatetimeArrayMixin, DatelikeOps, TimelikeOps, See Also --------- - Index : The base pandas Index type - TimedeltaIndex : Index of timedelta64 data - PeriodIndex : Index of Period data - pandas.to_datetime : Convert argument to datetime + Index : The base pandas Index type. + TimedeltaIndex : Index of timedelta64 data. + PeriodIndex : Index of Period data. + pandas.to_datetime : Convert argument to datetime. """ - _resolution = cache_readonly(DatetimeArrayMixin._resolution.fget) - _typ = 'datetimeindex' _join_precedence = 10 @@ -233,23 +186,17 @@ def _join_i8_wrapper(joinf, **kwargs): _left_indexer_unique = _join_i8_wrapper( libjoin.left_join_indexer_unique_int64, with_indexers=False) - @classmethod - def _add_comparison_methods(cls): - """ add in comparison methods """ - cls.__eq__ = _dt_index_cmp(cls, operator.eq) - cls.__ne__ = _dt_index_cmp(cls, operator.ne) - cls.__lt__ = _dt_index_cmp(cls, operator.lt) - cls.__gt__ = _dt_index_cmp(cls, operator.gt) - cls.__le__ = _dt_index_cmp(cls, operator.le) - cls.__ge__ = _dt_index_cmp(cls, operator.ge) - _engine_type = libindex.DatetimeEngine - tz = None + _tz = None _freq = None _comparables = ['name', 'freqstr', 'tz'] _attributes = ['name', 'freq', 'tz'] + # dummy attribute so that datetime.__eq__(DatetimeArray) defers + # by returning NotImplemented + timetuple = None + # define my properties & methods for delegation _bool_ops = ['is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', @@ -269,14 +216,40 @@ def _add_comparison_methods(cls): _is_numeric_dtype = False _infer_as_myclass = True - _timezone = cache_readonly(DatetimeArrayMixin._timezone.fget) - is_normalized = cache_readonly(DatetimeArrayMixin.is_normalized.fget) + + # -------------------------------------------------------------------- + # Constructors def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, tz=None, normalize=False, closed=None, ambiguous='raise', dayfirst=False, yearfirst=False, dtype=None, - copy=False, name=None, verify_integrity=True): + copy=False, name=None, verify_integrity=None): + + if verify_integrity is not None: + warnings.warn("The 'verify_integrity' argument is deprecated, " + "will be removed in a future version.", + FutureWarning, stacklevel=2) + else: + verify_integrity = True + + if data is None: + warnings.warn("Creating a 
DatetimeIndex by passing range " + "endpoints is deprecated. Use " + "`pandas.date_range` instead.", + FutureWarning, stacklevel=2) + result = cls._generate_range(start, end, periods, + freq=freq, tz=tz, normalize=normalize, + closed=closed, ambiguous=ambiguous) + result.name = name + return result + + if is_scalar(data): + raise TypeError("{cls}() must be called with a " + "collection of some kind, {data} was passed" + .format(cls=cls.__name__, data=repr(data))) + + # - Cases checked above all return/raise before reaching here - # # This allows to later ensure that the 'copy' parameter is honored: if isinstance(data, Index): @@ -287,144 +260,31 @@ def __new__(cls, data=None, if name is None and hasattr(data, 'name'): name = data.name - freq, freq_infer = dtl.maybe_infer_freq(freq) + dtarr = DatetimeArray._from_sequence( + data, dtype=dtype, copy=copy, tz=tz, freq=freq, + dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous) - # if dtype has an embedded tz, capture it - tz = dtl.validate_tz_from_dtype(dtype, tz) - - if data is None: - # TODO: Remove this block and associated kwargs; GH#20535 - if freq is None and com._any_none(periods, start, end): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') - periods = dtl.validate_periods(periods) - return cls._generate_range(start, end, periods, name, freq, - tz=tz, normalize=normalize, - closed=closed, ambiguous=ambiguous) - - if not isinstance(data, (np.ndarray, Index, ABCSeries)): - if is_scalar(data): - raise ValueError('DatetimeIndex() must be called with a ' - 'collection of some kind, %s was passed' - % repr(data)) - # other iterable of some kind - if not isinstance(data, (list, tuple)): - data = list(data) - data = np.asarray(data, dtype='O') - elif isinstance(data, ABCSeries): - data = data._values - - # data must be Index or np.ndarray here - if not (is_datetime64_dtype(data) or is_datetimetz(data) or - is_integer_dtype(data) or lib.infer_dtype(data) == 'integer'): - data = tools.to_datetime(data, dayfirst=dayfirst, - yearfirst=yearfirst) - - if isinstance(data, DatetimeArrayMixin): - if tz is None: - tz = data.tz - elif data.tz is None: - data = data.tz_localize(tz, ambiguous=ambiguous) - else: - # the tz's must match - if str(tz) != str(data.tz): - msg = ('data is already tz-aware {0}, unable to ' - 'set specified tz: {1}') - raise TypeError(msg.format(data.tz, tz)) - - subarr = data.values - - if freq is None: - freq = data.freq - verify_integrity = False - elif issubclass(data.dtype.type, np.datetime64): - if data.dtype != _NS_DTYPE: - data = conversion.ensure_datetime64ns(data) - if tz is not None: - # Convert tz-naive to UTC - tz = timezones.maybe_get_tz(tz) - data = conversion.tz_localize_to_utc(data.view('i8'), tz, - ambiguous=ambiguous) - subarr = data.view(_NS_DTYPE) - else: - # must be integer dtype otherwise - # assume this data are epoch timestamps - if data.dtype != _INT64_DTYPE: - data = data.astype(np.int64, copy=False) - subarr = data.view(_NS_DTYPE) - - subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) - if dtype is not None: - if not is_dtype_equal(subarr.dtype, dtype): - # dtype must be coerced to DatetimeTZDtype above - if subarr.tz is not None: - raise ValueError("cannot localize from non-UTC data") - - if verify_integrity and len(subarr) > 0: - if freq is not None and not freq_infer: - cls._validate_frequency(subarr, freq, ambiguous=ambiguous) - - if freq_infer: - inferred = subarr.inferred_freq - if inferred: - subarr.freq = to_offset(inferred) + subarr = 
cls._simple_new(dtarr._data, name=name, + freq=dtarr.freq, tz=dtarr.tz) return subarr._deepcopy_if_needed(ref_to_data, copy) @classmethod - @Appender(DatetimeArrayMixin._generate_range.__doc__) - def _generate_range(cls, start, end, periods, name=None, freq=None, - tz=None, normalize=False, ambiguous='raise', - closed=None): - out = super(DatetimeIndex, cls)._generate_range( - start, end, periods, freq, - tz=tz, normalize=normalize, ambiguous=ambiguous, closed=closed) - out.name = name - return out - - @classmethod - def _use_cached_range(cls, freq, _normalized, start, end): - # Note: This always returns False - return (freq._should_cache() and - not (freq._normalize_cache and not _normalized) and - _naive_in_cache_range(start, end)) - - def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ - if self._has_same_tz(value): - return _to_m8(value) - raise ValueError('Passed item and index have different timezone') - - @classmethod - def _simple_new(cls, values, name=None, freq=None, tz=None, - dtype=None, **kwargs): + def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): """ we require the we have a dtype compat for the values if we are passed a non-dtype compat, then coerce using the constructor """ + # DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes + assert isinstance(values, np.ndarray), type(values) - if getattr(values, 'dtype', None) is None: - # empty, but with dtype compat - if values is None: - values = np.empty(0, dtype=_NS_DTYPE) - return cls(values, name=name, freq=freq, tz=tz, - dtype=dtype, **kwargs) - values = np.array(values, copy=False) - - if not is_datetime64_dtype(values): - values = ensure_int64(values).view(_NS_DTYPE) - - values = getattr(values, 'values', values) - - assert isinstance(values, np.ndarray), "values is not an np.ndarray" - assert is_datetime64_dtype(values) - - result = super(DatetimeIndex, cls)._simple_new(values, freq, tz, - **kwargs) + result = super(DatetimeIndex, cls)._simple_new(values, freq, tz) result.name = name result._reset_identity() return result + # -------------------------------------------------------------------- + @property def _values(self): # tz-naive -> ndarray @@ -466,87 +326,12 @@ def nbytes(self): # for TZ-aware return self._ndarray_values.nbytes - @classmethod - def _cached_range(cls, start=None, end=None, periods=None, freq=None, - name=None): - if start is None and end is None: - # I somewhat believe this should never be raised externally - raise TypeError('Must specify either start or end.') - if start is not None: - start = Timestamp(start) - if end is not None: - end = Timestamp(end) - if (start is None or end is None) and periods is None: - raise TypeError( - 'Must either specify period or provide both start and end.') - - if freq is None: - # This can't happen with external-facing code - raise TypeError('Must provide freq.') - - drc = _daterange_cache - if freq not in _daterange_cache: - xdr = generate_range(offset=freq, start=_CACHE_START, - end=_CACHE_END) - - arr = tools.to_datetime(list(xdr), box=False) - - cachedRange = DatetimeIndex._simple_new(arr) - cachedRange.freq = freq - cachedRange = cachedRange.tz_localize(None) - cachedRange.name = None - drc[freq] = cachedRange - else: - cachedRange = drc[freq] - - if start is None: - if not isinstance(end, Timestamp): - raise AssertionError('end must be an instance of Timestamp') - - end = freq.rollback(end) - - endLoc = cachedRange.get_loc(end) + 1 - startLoc = endLoc - periods - elif end is None: - if not 
isinstance(start, Timestamp): - raise AssertionError('start must be an instance of Timestamp') - - start = freq.rollforward(start) - - startLoc = cachedRange.get_loc(start) - endLoc = startLoc + periods - else: - if not freq.onOffset(start): - start = freq.rollforward(start) - - if not freq.onOffset(end): - end = freq.rollback(end) - - startLoc = cachedRange.get_loc(start) - endLoc = cachedRange.get_loc(end) + 1 - - indexSlice = cachedRange[startLoc:endLoc] - indexSlice.name = name - indexSlice.freq = freq - - return indexSlice - - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return libts.ints_to_pydatetime(self.asi8, self.tz) - @cache_readonly def _is_dates_only(self): """Return a boolean if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only return _is_dates_only(self.values) and self.tz is None - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_datetime64 - formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: "'%s'" % formatter(x, tz=self.tz) - def __reduce__(self): # we use a special reudce here because we need @@ -573,11 +358,6 @@ def __setstate__(self, state): self._freq = own_state[1] self._tz = timezones.tz_standardize(own_state[2]) - # provide numpy < 1.7 compat - if nd_state[2] == 'M8[us]': - new_state = np.ndarray.__reduce__(data.astype('M8[ns]')) - np.ndarray.__setstate__(data, new_state[2]) - else: # pragma: no cover data = np.empty(state) np.ndarray.__setstate__(data, state) @@ -589,6 +369,12 @@ def __setstate__(self, state): raise Exception("invalid pickle state") _unpickle_compat = __setstate__ + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + if self._has_same_tz(value): + return _to_m8(value) + raise ValueError('Passed item and index have different timezone') + def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. 
freq) depending on op """ freq = attrs.get('freq', None) @@ -597,6 +383,13 @@ def _maybe_update_attributes(self, attrs): attrs['freq'] = 'infer' return attrs + # -------------------------------------------------------------------- + # Rendering Methods + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return libts.ints_to_pydatetime(self.asi8, self.tz) + def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values format = _get_format_datetime64_from_values(self, date_format) @@ -606,171 +399,14 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): format=format, na_rep=na_rep) - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - dtype = pandas_dtype(dtype) - if (is_datetime64_ns_dtype(dtype) and - not is_dtype_equal(dtype, self.dtype)): - # GH 18951: datetime64_ns dtype but not equal means different tz - new_tz = getattr(dtype, 'tz', None) - if getattr(self.dtype, 'tz', None) is None: - return self.tz_localize(new_tz) - return self.tz_convert(new_tz) - elif is_period_dtype(dtype): - return self.to_period(freq=dtype.freq) - return super(DatetimeIndex, self).astype(dtype, copy=copy) - - def _get_time_micros(self): - values = self.asi8 - if self.tz is not None and self.tz is not utc: - values = self._local_timestamps() - return fields.get_time_micros(values) - - def to_series(self, keep_tz=False, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index - - Parameters - ---------- - keep_tz : optional, defaults False. - return the data keeping the timezone. - - If keep_tz is True: - - If the timezone is not set, the resulting - Series will have a datetime64[ns] dtype. - - Otherwise the Series will have an datetime64[ns, tz] dtype; the - tz will be preserved. - - If keep_tz is False: - - Series will have a datetime64[ns] dtype. TZ aware - objects will have the tz removed. - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index - - Returns - ------- - Series - """ - from pandas import Series - - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name - - if keep_tz and self.tz is not None: - # preserve the tz & copy - values = self.copy(deep=True) - else: - values = self.values.copy() - - return Series(values, index=index, name=name) - - def to_period(self, freq=None): - """ - Cast to PeriodIndex at a particular frequency. - - Converts DatetimeIndex to PeriodIndex. - - Parameters - ---------- - freq : string or Offset, optional - One of pandas' :ref:`offset strings ` - or an Offset object. Will be inferred by default. - - Returns - ------- - PeriodIndex - - Raises - ------ - ValueError - When converting a DatetimeIndex with non-regular values, so that a - frequency cannot be inferred. - - Examples - -------- - >>> df = pd.DataFrame({"y": [1,2,3]}, - ... index=pd.to_datetime(["2000-03-31 00:00:00", - ... "2000-05-31 00:00:00", - ... 
"2000-08-31 00:00:00"])) - >>> df.index.to_period("M") - PeriodIndex(['2000-03', '2000-05', '2000-08'], - dtype='period[M]', freq='M') - - Infer the daily frequency - - >>> idx = pd.date_range("2017-01-01", periods=2) - >>> idx.to_period() - PeriodIndex(['2017-01-01', '2017-01-02'], - dtype='period[D]', freq='D') - - See also - -------- - pandas.PeriodIndex: Immutable ndarray holding ordinal values - pandas.DatetimeIndex.to_pydatetime: Return DatetimeIndex as object - """ - from pandas.core.indexes.period import PeriodIndex - - if self.tz is not None: - warnings.warn("Converting to PeriodIndex representation will " - "drop timezone information.", UserWarning) - - if freq is None: - freq = self.freqstr or self.inferred_freq - - if freq is None: - msg = ("You must pass a freq argument as " - "current index has none.") - raise ValueError(msg) - - freq = get_period_alias(freq) - - return PeriodIndex(self.values, name=self.name, freq=freq) - - def snap(self, freq='S'): - """ - Snap time stamps to nearest occurring frequency - - """ - # Superdumb, punting on any optimizing - freq = to_offset(freq) - - snapped = np.empty(len(self), dtype=_NS_DTYPE) - - for i, v in enumerate(self): - s = v - if not freq.onOffset(s): - t0 = freq.rollback(s) - t1 = freq.rollforward(s) - if abs(s - t0) < abs(t1 - s): - s = t0 - else: - s = t1 - snapped[i] = s - - # we know it conforms; skip check - return DatetimeIndex(snapped, freq=freq, verify_integrity=False) + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_datetime64 + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: "'%s'" % formatter(x, tz=self.tz) - def unique(self, level=None): - # Override here since IndexOpsMixin.unique uses self._values.unique - # For DatetimeIndex with TZ, that's a DatetimeIndex -> recursion error - # So we extract the tz-naive DatetimeIndex, unique that, and wrap the - # result with out TZ. - if self.tz is not None: - naive = type(self)(self._ndarray_values, copy=False) - else: - naive = self - result = super(DatetimeIndex, naive).unique(level=level) - return self._simple_new(result.values, name=self.name, tz=self.tz, - freq=self.freq) + # -------------------------------------------------------------------- + # Set Operation Methods def union(self, other): """ @@ -787,6 +423,10 @@ def union(self, other): y : Index or DatetimeIndex """ self._assert_can_do_setop(other) + + if len(other) == 0 or self.equals(other) or len(self) == 0: + return super(DatetimeIndex, self).union(other) + if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) @@ -806,23 +446,6 @@ def union(self, other): result.freq = to_offset(result.inferred_freq) return result - def to_perioddelta(self, freq): - """ - Calculate TimedeltaIndex of difference between index - values and index converted to periodIndex at specified - freq. 
Used for vectorized offsets - - Parameters - ---------- - freq: Period frequency - - Returns - ------- - y: TimedeltaIndex - """ - return to_timedelta(self.asi8 - self.to_period(freq) - .to_timestamp().asi8) - def union_many(self, others): """ A bit of a hack to accelerate unioning a collection of indexes @@ -852,51 +475,6 @@ def union_many(self, others): return this - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - """ - See Index.join - """ - if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type not in ('floating', 'integer', 'mixed-integer', - 'mixed-integer-float', 'mixed')): - try: - other = DatetimeIndex(other) - except (TypeError, ValueError): - pass - - this, other = self._maybe_utc_convert(other) - return Index.join(this, other, how=how, level=level, - return_indexers=return_indexers, sort=sort) - - def _maybe_utc_convert(self, other): - this = self - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') - elif other.tz is not None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert('UTC') - other = other.tz_convert('UTC') - return this, other - - def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None - if (isinstance(other, DatetimeIndex) and - self.freq == other.freq and - self._can_fast_union(other)): - joined = self._shallow_copy(joined) - joined.name = name - return joined - else: - tz = getattr(other, 'tz', None) - return self._simple_new(joined, name, tz=tz) - def _can_fast_union(self, other): if not isinstance(other, DatetimeIndex): return False @@ -943,28 +521,25 @@ def _fast_union(self, other): else: left, right = other, self - left_start, left_end = left[0], left[-1] + left_end = left[-1] right_end = right[-1] - if not self.freq._should_cache(): - # concatenate dates - if left_end < right_end: - loc = right.searchsorted(left_end, side='right') - right_chunk = right.values[loc:] - dates = _concat._concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - else: - return left + # TODO: consider re-implementing freq._should_cache for fastpath + + # concatenate dates + if left_end < right_end: + loc = right.searchsorted(left_end, side='right') + right_chunk = right.values[loc:] + dates = _concat._concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) else: - return type(self)(start=left_start, - end=max(left_end, right_end), - freq=left.freq) + return left - def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None + def _wrap_setop_result(self, other, result): + name = get_op_result_name(self, other) if not timezones.tz_compare(self.tz, other.tz): raise ValueError('Passed item and index have different timezone') - return self._simple_new(result, name=name, freq=None, tz=self.tz) + return self._shallow_copy(result, name=name, freq=None, tz=self.tz) def intersection(self, other): """ @@ -980,6 +555,10 @@ def intersection(self, other): y : Index or DatetimeIndex """ self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) @@ -1022,6 +601,172 @@ def intersection(self, other): left_chunk = left.values[lslice] return self._shallow_copy(left_chunk) + # 
-------------------------------------------------------------------- + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if (is_datetime64_ns_dtype(dtype) and + not is_dtype_equal(dtype, self.dtype)): + # GH 18951: datetime64_ns dtype but not equal means different tz + new_tz = getattr(dtype, 'tz', None) + if getattr(self.dtype, 'tz', None) is None: + return self.tz_localize(new_tz) + return self.tz_convert(new_tz) + elif is_period_dtype(dtype): + return self.to_period(freq=dtype.freq) + return super(DatetimeIndex, self).astype(dtype, copy=copy) + + def _get_time_micros(self): + values = self.asi8 + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + return fields.get_time_micros(values) + + def to_series(self, keep_tz=None, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False + Return the data keeping the timezone. + + If keep_tz is True: + + If the timezone is not set, the resulting + Series will have a datetime64[ns] dtype. + + Otherwise the Series will have an datetime64[ns, tz] dtype; the + tz will be preserved. + + If keep_tz is False: + + Series will have a datetime64[ns] dtype. TZ aware + objects will have the tz removed. + + .. versionchanged:: 0.24 + The default value will change to True in a future release. + You can set ``keep_tz=True`` to already obtain the future + behaviour and silence the warning. + + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index + + Returns + ------- + Series + """ + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + if keep_tz is None and self.tz is not None: + warnings.warn("The default of the 'keep_tz' keyword will change " + "to True in a future release. You can set " + "'keep_tz=True' to obtain the future behaviour and " + "silence this warning.", FutureWarning, stacklevel=2) + keep_tz = False + elif keep_tz is False: + warnings.warn("Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", FutureWarning, stacklevel=2) + + if keep_tz and self.tz is not None: + # preserve the tz & copy + values = self.copy(deep=True) + else: + values = self.values.copy() + + return Series(values, index=index, name=name) + + def snap(self, freq='S'): + """ + Snap time stamps to nearest occurring frequency + """ + # Superdumb, punting on any optimizing + freq = to_offset(freq) + + snapped = np.empty(len(self), dtype=_NS_DTYPE) + + for i, v in enumerate(self): + s = v + if not freq.onOffset(s): + t0 = freq.rollback(s) + t1 = freq.rollforward(s) + if abs(s - t0) < abs(t1 - s): + s = t0 + else: + s = t1 + snapped[i] = s + + # we know it conforms; skip check + return DatetimeIndex._simple_new(snapped, freq=freq) + # TODO: what about self.name? tz? if so, use shallow_copy? 
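# Illustrative usage sketch for the relocated ``snap`` method (assumes a
# pandas build that contains this change; the timestamps below are
# hypothetical and not taken from the patch):
import pandas as pd

idx = pd.DatetimeIndex(['2018-01-01 09:58', '2018-01-01 10:02'])
# Each value is rolled back or forward to whichever hourly boundary is
# nearer, so both snap to 10:00; per the hunk above, the result is built
# through _simple_new with the requested freq rather than round-tripping
# through the public constructor.
snapped = idx.snap(freq='H')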
+ + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + + # TODO(DatetimeArray): change dispatch once inheritance is removed + # call DatetimeArray method + result = DatetimeArray.unique(self) + return self._shallow_copy(result._data) + + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): + """ + See Index.join + """ + if (not isinstance(other, DatetimeIndex) and len(other) > 0 and + other.inferred_type not in ('floating', 'integer', 'mixed-integer', + 'mixed-integer-float', 'mixed')): + try: + other = DatetimeIndex(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join(this, other, how=how, level=level, + return_indexers=return_indexers, sort=sort) + + def _maybe_utc_convert(self, other): + this = self + if isinstance(other, DatetimeIndex): + if self.tz is not None: + if other.tz is None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + elif other.tz is not None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert('UTC') + other = other.tz_convert('UTC') + return this, other + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if (isinstance(other, DatetimeIndex) and + self.freq == other.freq and + self._can_fast_union(other)): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + tz = getattr(other, 'tz', None) + return self._simple_new(joined, name, tz=tz) + def _parsed_string_to_bounds(self, reso, parsed): """ Calculate datetime bounds for parsed time string and its resolution. @@ -1143,7 +888,10 @@ def get_value(self, series, key): # needed to localize naive datetimes if self.tz is not None: - key = Timestamp(key, tz=self.tz) + if key.tzinfo is not None: + key = Timestamp(key).tz_convert(self.tz) + else: + key = Timestamp(key).tz_localize(self.tz) return self.get_value_maybe_box(series, key) @@ -1169,7 +917,11 @@ def get_value(self, series, key): def get_value_maybe_box(self, series, key): # needed to localize naive datetimes if self.tz is not None: - key = Timestamp(key, tz=self.tz) + key = Timestamp(key) + if key.tzinfo is not None: + key = key.tz_convert(self.tz) + else: + key = key.tz_localize(self.tz) elif not isinstance(key, Timestamp): key = Timestamp(key) values = self._engine.get_value(com.values_from_object(series), @@ -1192,7 +944,10 @@ def get_loc(self, key, method=None, tolerance=None): if isinstance(key, datetime): # needed to localize naive datetimes - key = Timestamp(key, tz=self.tz) + if key.tzinfo is None: + key = Timestamp(key, tz=self.tz) + else: + key = Timestamp(key).tz_convert(self.tz) return Index.get_loc(self, key, method, tolerance) elif isinstance(key, timedelta): @@ -1216,7 +971,11 @@ def get_loc(self, key, method=None, tolerance=None): pass try: - stamp = Timestamp(key, tz=self.tz) + stamp = Timestamp(key) + if stamp.tzinfo is not None and self.tz is not None: + stamp = stamp.tz_convert(self.tz) + else: + stamp = stamp.tz_localize(self.tz) return Index.get_loc(self, stamp, method, tolerance) except KeyError: raise KeyError(key) @@ -1327,38 +1086,57 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): else: raise - year = _wrap_field_accessor('year') - month = _wrap_field_accessor('month') - day = _wrap_field_accessor('day') - hour = _wrap_field_accessor('hour') - minute = _wrap_field_accessor('minute') - second 
= _wrap_field_accessor('second') - microsecond = _wrap_field_accessor('microsecond') - nanosecond = _wrap_field_accessor('nanosecond') - weekofyear = _wrap_field_accessor('weekofyear') + # -------------------------------------------------------------------- + # Wrapping DatetimeArray + + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique + + _timezone = cache_readonly(DatetimeArray._timezone.fget) + is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) + _resolution = cache_readonly(DatetimeArray._resolution.fget) + + year = wrap_field_accessor(DatetimeArray.year) + month = wrap_field_accessor(DatetimeArray.month) + day = wrap_field_accessor(DatetimeArray.day) + hour = wrap_field_accessor(DatetimeArray.hour) + minute = wrap_field_accessor(DatetimeArray.minute) + second = wrap_field_accessor(DatetimeArray.second) + microsecond = wrap_field_accessor(DatetimeArray.microsecond) + nanosecond = wrap_field_accessor(DatetimeArray.nanosecond) + weekofyear = wrap_field_accessor(DatetimeArray.weekofyear) week = weekofyear - dayofweek = _wrap_field_accessor('dayofweek') + dayofweek = wrap_field_accessor(DatetimeArray.dayofweek) weekday = dayofweek - weekday_name = _wrap_field_accessor('weekday_name') + weekday_name = wrap_field_accessor(DatetimeArray.weekday_name) - dayofyear = _wrap_field_accessor('dayofyear') - quarter = _wrap_field_accessor('quarter') - days_in_month = _wrap_field_accessor('days_in_month') + dayofyear = wrap_field_accessor(DatetimeArray.dayofyear) + quarter = wrap_field_accessor(DatetimeArray.quarter) + days_in_month = wrap_field_accessor(DatetimeArray.days_in_month) daysinmonth = days_in_month - is_month_start = _wrap_field_accessor('is_month_start') - is_month_end = _wrap_field_accessor('is_month_end') - is_quarter_start = _wrap_field_accessor('is_quarter_start') - is_quarter_end = _wrap_field_accessor('is_quarter_end') - is_year_start = _wrap_field_accessor('is_year_start') - is_year_end = _wrap_field_accessor('is_year_end') - is_leap_year = _wrap_field_accessor('is_leap_year') - - @Appender(DatetimeArrayMixin.normalize.__doc__) - def normalize(self): - result = DatetimeArrayMixin.normalize(self) - result.name = self.name - return result + is_month_start = wrap_field_accessor(DatetimeArray.is_month_start) + is_month_end = wrap_field_accessor(DatetimeArray.is_month_end) + is_quarter_start = wrap_field_accessor(DatetimeArray.is_quarter_start) + is_quarter_end = wrap_field_accessor(DatetimeArray.is_quarter_end) + is_year_start = wrap_field_accessor(DatetimeArray.is_year_start) + is_year_end = wrap_field_accessor(DatetimeArray.is_year_end) + is_leap_year = wrap_field_accessor(DatetimeArray.is_leap_year) + + tz_localize = wrap_array_method(DatetimeArray.tz_localize, True) + tz_convert = wrap_array_method(DatetimeArray.tz_convert, True) + to_perioddelta = wrap_array_method(DatetimeArray.to_perioddelta, + False) + to_period = wrap_array_method(DatetimeArray.to_period, True) + normalize = wrap_array_method(DatetimeArray.normalize, True) + to_julian_date = wrap_array_method(DatetimeArray.to_julian_date, + False) + month_name = wrap_array_method(DatetimeArray.month_name, True) + day_name = wrap_array_method(DatetimeArray.day_name, True) + + # -------------------------------------------------------------------- @Substitution(klass='DatetimeIndex') @Appender(_shared_docs['searchsorted']) @@ -1421,8 +1199,7 @@ def insert(self, loc, item): 
try: new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)) - return DatetimeIndex(new_dates, name=self.name, freq=freq, - tz=self.tz) + return self._shallow_copy(new_dates, freq=freq) except (AttributeError, TypeError): # fall back to object index @@ -1458,7 +1235,7 @@ def delete(self, loc): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq - return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) + return self._shallow_copy(new_dates, freq=freq) def indexer_at_time(self, time, asof=False): """ @@ -1547,20 +1324,8 @@ def indexer_between_time(self, start_time, end_time, include_start=True, return mask.nonzero()[0] - def to_julian_date(self): - """ - Convert DatetimeIndex to Float64Index of Julian Dates. - 0 Julian date is noon January 1, 4713 BC. - http://en.wikipedia.org/wiki/Julian_day - """ - result = DatetimeArrayMixin.to_julian_date(self) - return Float64Index(result) - month_name = _wrap_in_index("month_name") - day_name = _wrap_in_index("day_name") - - -DatetimeIndex._add_comparison_methods() +DatetimeIndex._add_comparison_ops() DatetimeIndex._add_numeric_methods_disabled() DatetimeIndex._add_logical_methods_disabled() DatetimeIndex._add_datetimelike_methods() @@ -1651,7 +1416,8 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3) DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', - '2018-04-27 00:00:00'], freq=None) + '2018-04-27 00:00:00'], + dtype='datetime64[ns]', freq=None) **Other Parameters** @@ -1707,9 +1473,13 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, if freq is None and com._any_none(periods, start, end): freq = 'D' - return DatetimeIndex(start=start, end=end, periods=periods, - freq=freq, tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) + result = DatetimeIndex._generate_range( + start=start, end=end, periods=periods, + freq=freq, tz=tz, normalize=normalize, + closed=closed, **kwargs) + + result.name = name + return result def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, @@ -1722,37 +1492,39 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, Parameters ---------- start : string or datetime-like, default None - Left bound for generating dates + Left bound for generating dates. end : string or datetime-like, default None - Right bound for generating dates + Right bound for generating dates. periods : integer, default None - Number of periods to generate + Number of periods to generate. freq : string or DateOffset, default 'B' (business daily) - Frequency strings can have multiples, e.g. '5H' + Frequency strings can have multiples, e.g. '5H'. tz : string or None Time zone name for returning localized DatetimeIndex, for example - Asia/Beijing + Asia/Beijing. normalize : bool, default False - Normalize start/end dates to midnight before generating date range + Normalize start/end dates to midnight before generating date range. name : string, default None - Name of the resulting DatetimeIndex + Name of the resulting DatetimeIndex. weekmask : string or None, default None Weekmask of valid business days, passed to ``numpy.busdaycalendar``, only used when custom frequency strings are passed. The default - value None is equivalent to 'Mon Tue Wed Thu Fri' + value None is equivalent to 'Mon Tue Wed Thu Fri'. .. 
versionadded:: 0.21.0 holidays : list-like or None, default None Dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar``, only used when custom frequency strings - are passed + are passed. .. versionadded:: 0.21.0 closed : string, default None Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) + the 'left', 'right', or both sides (None). + **kwargs + For compatibility. Has no effect on the result. Notes ----- @@ -1766,7 +1538,16 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, Returns ------- - rng : DatetimeIndex + DatetimeIndex + + Examples + -------- + Note how the two weekend days are skipped in the result. + + >>> pd.bdate_range(start='1/1/2018', end='1/08/2018') + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-08'], + dtype='datetime64[ns]', freq='B') """ if freq is None: msg = 'freq must be specified for bdate_range; use date_range instead' @@ -1784,9 +1565,9 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, 'weekmask are passed, got frequency {freq}').format(freq=freq) raise ValueError(msg) - return DatetimeIndex(start=start, end=end, periods=periods, - freq=freq, tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) + return date_range(start=start, end=end, periods=periods, + freq=freq, tz=tz, normalize=normalize, name=name, + closed=closed, **kwargs) def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, @@ -1843,24 +1624,10 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, holidays = kwargs.pop('holidays', []) weekmask = kwargs.pop('weekmask', 'Mon Tue Wed Thu Fri') freq = CDay(holidays=holidays, weekmask=weekmask) - return DatetimeIndex(start=start, end=end, periods=periods, freq=freq, - tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) - - -_CACHE_START = Timestamp(datetime(1950, 1, 1)) -_CACHE_END = Timestamp(datetime(2030, 1, 1)) - -_daterange_cache = {} - -def _naive_in_cache_range(start, end): - if start is None or end is None: - return False - else: - if start.tzinfo is not None or end.tzinfo is not None: - return False - return start > _CACHE_START and end < _CACHE_END + return date_range(start=start, end=end, periods=periods, freq=freq, + tz=tz, normalize=normalize, name=name, + closed=closed, **kwargs) def _time_to_micros(time): diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 289970aaf3a82..46731069d88b8 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -8,10 +8,16 @@ """ +import warnings + import numpy as np -from pandas.core.base import PandasObject + from pandas.util._decorators import deprecate_kwarg + from pandas.core.dtypes.cast import coerce_indexer_dtype + +from pandas.core.base import PandasObject + from pandas.io.formats.printing import pprint_thing @@ -22,15 +28,47 @@ class FrozenList(PandasObject, list): because it's technically non-hashable, will be used for lookups, appropriately, etc. """ - # Sidenote: This has to be of type list, otherwise it messes up PyTables - # typechecks + # Side note: This has to be of type list. Otherwise, + # it messes up PyTables type checks. + + def union(self, other): + """ + Returns a FrozenList with other concatenated to the end of self. - def __add__(self, other): + Parameters + ---------- + other : array-like + The array-like whose elements we are concatenating. 
+ + Returns + ------- + diff : FrozenList + The collection difference between self and other. + """ if isinstance(other, tuple): other = list(other) - return self.__class__(super(FrozenList, self).__add__(other)) + return type(self)(super(FrozenList, self).__add__(other)) + + def difference(self, other): + """ + Returns a FrozenList with elements from other removed from self. - __iadd__ = __add__ + Parameters + ---------- + other : array-like + The array-like whose elements we are removing self. + + Returns + ------- + diff : FrozenList + The collection difference between self and other. + """ + other = set(other) + temp = [x for x in self if x not in other] + return type(self)(temp) + + # TODO: Consider deprecating these in favor of `union` (xref gh-15506) + __add__ = __iadd__ = union # Python 2 compat def __getslice__(self, i, j): @@ -86,6 +124,10 @@ class FrozenNDArray(PandasObject, np.ndarray): # no __array_finalize__ for now because no metadata def __new__(cls, data, dtype=None, copy=False): + warnings.warn("\nFrozenNDArray is deprecated and will be removed in a " + "future version.\nPlease use `numpy.ndarray` instead.\n", + FutureWarning, stacklevel=2) + if copy is None: copy = not isinstance(data, FrozenNDArray) res = np.array(data, dtype=dtype, copy=copy).view(cls) @@ -127,7 +169,7 @@ def searchsorted(self, value, side="left", sorter=None): See Also -------- - numpy.searchsorted : equivalent function + numpy.searchsorted : Equivalent function. """ # We are much more performant if the searched diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 25d4dd0cbcc81..444f9e21b0bdc 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -4,56 +4,46 @@ import numpy as np +from pandas._libs import Timedelta, Timestamp +from pandas._libs.interval import Interval, IntervalMixin, IntervalTree from pandas.compat import add_metaclass -from pandas.core.dtypes.missing import isna +from pandas.util._decorators import Appender, cache_readonly +from pandas.util._doctools import _WritableDoc +from pandas.util._exceptions import rewrite_exception + from pandas.core.dtypes.cast import ( - find_common_type, maybe_downcast_to_dtype, infer_dtype_from_scalar) + find_common_type, infer_dtype_from_scalar, maybe_downcast_to_dtype) from pandas.core.dtypes.common import ( - ensure_platform_int, - is_list_like, - is_datetime_or_timedelta_dtype, - is_datetime64tz_dtype, - is_dtype_equal, - is_integer_dtype, - is_float_dtype, - is_interval_dtype, - is_object_dtype, - is_scalar, - is_float, - is_number, - is_integer) -from pandas.core.indexes.base import ( - Index, ensure_index, - default_pprint, _index_shared_docs) - -from pandas._libs import Timestamp, Timedelta -from pandas._libs.interval import ( - Interval, IntervalMixin, IntervalTree, -) + ensure_platform_int, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, + is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype, + is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar) +from pandas.core.dtypes.missing import isna -from pandas.core.indexes.datetimes import date_range, DatetimeIndex -from pandas.core.indexes.timedeltas import timedelta_range, TimedeltaIndex -from pandas.core.indexes.multi import MultiIndex +from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com -from pandas.util._decorators import cache_readonly, Appender -from pandas.util._doctools import _WritableDoc -from pandas.util._exceptions import 
rewrite_exception from pandas.core.config import get_option +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, _index_shared_docs, default_pprint, ensure_index) +from pandas.core.indexes.datetimes import DatetimeIndex, date_range +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range +from pandas.core.ops import get_op_result_name + from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset -import pandas.core.indexes.base as ibase -from pandas.core.arrays.interval import (IntervalArray, - _interval_shared_docs) - _VALID_CLOSED = {'left', 'right', 'both', 'neither'} _index_doc_kwargs = dict(ibase._index_doc_kwargs) + +# TODO(jschendel) remove constructor key when IntervalArray is public (GH22860) _index_doc_kwargs.update( dict(klass='IntervalIndex', + constructor='pd.IntervalIndex', target_klass='IntervalIndex or list of Intervals', name=textwrap.dedent("""\ name : object, optional - to be stored in the index. + Name to be stored in the index. """), )) @@ -114,17 +104,18 @@ def _new_IntervalIndex(cls, d): summary="Immutable index of intervals that are closed on the same side.", name=_index_doc_kwargs['name'], versionadded="0.20.0", + extra_attributes="is_overlapping\n", extra_methods="contains\n", examples=textwrap.dedent("""\ - Examples -------- A new ``IntervalIndex`` is typically constructed using :func:`interval_range`: >>> pd.interval_range(start=0, end=5) - IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]] - closed='right', dtype='interval[int64]') + IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], + closed='right', + dtype='interval[int64]') It may also be constructed using one of the constructor methods: :meth:`IntervalIndex.from_arrays`, @@ -147,6 +138,9 @@ class IntervalIndex(IntervalMixin, Index): # Immutable, so we are able to cache computations like isna in '_mask' _mask = None + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data, closed=None, dtype=None, copy=False, name=None, verify_integrity=True): @@ -178,6 +172,50 @@ def _simple_new(cls, array, name, closed=None): result._reset_identity() return result + @classmethod + @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) + def from_breaks(cls, breaks, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) + def from_arrays(cls, left, right, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_arrays(left, right, closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) + def from_intervals(cls, data, closed=None, name=None, copy=False, + dtype=None): + msg = ('IntervalIndex.from_intervals is deprecated and will be ' + 'removed in a future version; Use IntervalIndex(...) 
instead') + warnings.warn(msg, FutureWarning, stacklevel=2) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) + + if name is None and isinstance(data, cls): + name = data.name + + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) + def from_tuples(cls, data, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(arr, name=name) + + # -------------------------------------------------------------------- + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, left=None, right=None, **kwargs): result = self._data._shallow_copy(left=left, right=right) @@ -241,48 +279,6 @@ def contains(self, key): except KeyError: return False - @classmethod - @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) - def from_breaks(cls, breaks, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, - dtype=dtype) - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) - def from_arrays(cls, left, right, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_arrays(left, right, closed, copy=copy, - dtype=dtype) - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) - def from_intervals(cls, data, closed=None, name=None, copy=False, - dtype=None): - msg = ('IntervalIndex.from_intervals is deprecated and will be ' - 'removed in a future version; Use IntervalIndex(...) instead') - warnings.warn(msg, FutureWarning, stacklevel=2) - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) - - if name is None and isinstance(data, cls): - name = data.name - - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) - def from_tuples(cls, data, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, - dtype=dtype) - return cls._simple_new(arr, name=name) - @Appender(_interval_shared_docs['to_tuples'] % dict( return_type="Index", examples=""" @@ -474,6 +470,61 @@ def is_unique(self): def is_non_overlapping_monotonic(self): return self._data.is_non_overlapping_monotonic + @property + def is_overlapping(self): + """ + Return True if the IntervalIndex has overlapping intervals, else False. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + .. versionadded:: 0.24.0 + + Returns + ------- + bool + Boolean indicating if the IntervalIndex has overlapping intervals. 
+ + Examples + -------- + >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)]) + >>> index + IntervalIndex([(0, 2], (1, 3], (4, 5]], + closed='right', + dtype='interval[int64]') + >>> index.is_overlapping + True + + Intervals that share closed endpoints overlap: + + >>> index = pd.interval_range(0, 3, closed='both') + >>> index + IntervalIndex([[0, 1], [1, 2], [2, 3]], + closed='both', + dtype='interval[int64]') + >>> index.is_overlapping + True + + Intervals that only have an open endpoint in common do not overlap: + + >>> index = pd.interval_range(0, 3, closed='left') + >>> index + IntervalIndex([[0, 1), [1, 2), [2, 3)], + closed='left', + dtype='interval[int64]') + >>> index.is_overlapping + False + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + IntervalIndex.overlaps : Check an IntervalIndex elementwise for + overlaps. + """ + # GH 23309 + return self._engine.is_overlapping + @Appender(_index_shared_docs['_convert_scalar_indexer']) def _convert_scalar_indexer(self, key, kind=None): if kind == 'iloc': @@ -580,6 +631,10 @@ def _maybe_convert_i8(self, key): else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) + if key.hasnans: + # convert NaT from it's i8 value to np.nan so it's not viewed + # as a valid value, maybe causing errors (e.g. is_overlapping) + key_i8 = key_i8.where(~key._isnan) # ensure consistency with IntervalIndex subtype subtype = self.dtype.subtype @@ -951,6 +1006,8 @@ def __getitem__(self, value): # scalar return result + # -------------------------------------------------------------------- + # Rendering Methods # __repr__ associated methods are based on MultiIndex def _format_with_header(self, header, **kwargs): @@ -1007,6 +1064,8 @@ def _format_space(self): space = ' ' * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) + # -------------------------------------------------------------------- + def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) @@ -1028,8 +1087,12 @@ def equals(self, other): self.right.equals(other.right) and self.closed == other.closed) + @Appender(_interval_shared_docs['overlaps'] % _index_doc_kwargs) + def overlaps(self, other): + return self._data.overlaps(other) + def _setop(op_name): - def func(self, other): + def func(self, other, sort=True): other = self._as_like_interval_index(other) # GH 19016: ensure set op will not return a prohibited dtype @@ -1040,8 +1103,12 @@ def func(self, other): 'objects that have compatible dtypes') raise TypeError(msg.format(op=op_name)) - result = getattr(self._multiindex, op_name)(other._multiindex) - result_name = self.name if self.name == other.name else None + if op_name == 'difference': + result = getattr(self._multiindex, op_name)(other._multiindex, + sort) + else: + result = getattr(self._multiindex, op_name)(other._multiindex) + result_name = get_op_result_name(self, other) # GH 19101: ensure empty results have correct dtype if result.empty: @@ -1053,6 +1120,14 @@ def func(self, other): name=result_name) return func + @property + def is_all_dates(self): + """ + This is False even when left/right contain datetime-like objects, + as the check is done on the Interval itself + """ + return False + union = _setop('union') intersection = _setop('intersection') difference = _setop('difference') @@ -1169,7 +1244,7 @@ def interval_range(start=None, end=None, periods=None, freq=None, See Also -------- - IntervalIndex : an Index of intervals that are all closed on the same side. 
+ IntervalIndex : An Index of intervals that are all closed on the same side. """ start = com.maybe_box_datetimelike(start) end = com.maybe_box_datetimelike(end) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3cccb65503378..5e26a3c6c439e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,48 +1,36 @@ - # pylint: disable=E1101,E1103,W0232 import datetime -import warnings from sys import getsizeof +import warnings import numpy as np -from pandas._libs import algos as libalgos, index as libindex, lib, Timestamp -from pandas._libs import tslibs -from pandas.compat import range, zip, lrange, lzip, map +from pandas._libs import ( + Timestamp, algos as libalgos, index as libindex, lib, tslibs) +import pandas.compat as compat +from pandas.compat import lrange, lzip, map, range, zip from pandas.compat.numpy import function as nv -from pandas import compat +from pandas.errors import PerformanceWarning, UnsortedIndexError +from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.common import ( - ensure_int64, - ensure_platform_int, - is_categorical_dtype, - is_object_dtype, - is_hashable, - is_integer, - is_iterator, - is_list_like, - pandas_dtype, - is_scalar) -from pandas.core.dtypes.missing import isna, array_equivalent -from pandas.errors import PerformanceWarning, UnsortedIndexError + ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable, + is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, + pandas_dtype) +from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype +from pandas.core.dtypes.missing import array_equivalent, isna -from pandas.util._decorators import Appender, cache_readonly +import pandas.core.algorithms as algos import pandas.core.common as com +from pandas.core.config import get_option +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, InvalidIndexError, _index_shared_docs, ensure_index) +from pandas.core.indexes.frozen import FrozenList, _ensure_frozen import pandas.core.missing as missing -import pandas.core.algorithms as algos -from pandas.io.formats.printing import pprint_thing -from pandas.core.config import get_option +from pandas.io.formats.printing import pprint_thing -from pandas.core.indexes.base import ( - Index, ensure_index, - InvalidIndexError, - _index_shared_docs) -from pandas.core.indexes.frozen import ( - FrozenNDArray, FrozenList, _ensure_frozen) -import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( dict(klass='MultiIndex', @@ -132,14 +120,21 @@ def _codes_to_ints(self, codes): class MultiIndex(Index): """ - A multi-level, or hierarchical, index object for pandas objects + A multi-level, or hierarchical, index object for pandas objects. Parameters ---------- levels : sequence of arrays The unique labels for each level + codes : sequence of arrays + Integers for each level designating which label at each location + + .. versionadded:: 0.24.0 labels : sequence of arrays Integers for each level designating which label at each location + + .. 
deprecated:: 0.24.0 + Use ``codes`` instead sortorder : optional int Level of sortedness (must be lexicographically sorted by that level) @@ -148,7 +143,7 @@ class MultiIndex(Index): copy : boolean, default False Copy the meta-data verify_integrity : boolean, default True - Check that the levels/labels are consistent and valid + Check that the levels/codes are consistent and valid Examples --------- @@ -172,17 +167,17 @@ class MultiIndex(Index): See Also -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_product : Create a MultiIndex from the cartesian product - of iterables - MultiIndex.from_tuples : Convert list of tuples to a MultiIndex - Index : The base pandas Index type + of iterables. + MultiIndex.from_tuples : Convert list of tuples to a MultiIndex. + Index : The base pandas Index type. Attributes ---------- names levels - labels + codes nlevels levshape @@ -192,8 +187,9 @@ class MultiIndex(Index): from_tuples from_product set_levels - set_labels + set_codes to_frame + to_flat_index is_lexsorted sortlevel droplevel @@ -206,29 +202,33 @@ class MultiIndex(Index): _typ = 'multiindex' _names = FrozenList() _levels = FrozenList() - _labels = FrozenList() + _codes = FrozenList() _comparables = ['names'] rename = Index.set_names - def __new__(cls, levels=None, labels=None, sortorder=None, names=None, + # -------------------------------------------------------------------- + # Constructors + + @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') + def __new__(cls, levels=None, codes=None, sortorder=None, names=None, dtype=None, copy=False, name=None, verify_integrity=True, _set_identity=True): # compat with Index if name is not None: names = name - if levels is None or labels is None: - raise TypeError("Must pass both levels and labels") - if len(levels) != len(labels): - raise ValueError('Length of levels and labels must be the same.') + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + if len(levels) != len(codes): + raise ValueError('Length of levels and codes must be the same.') if len(levels) == 0: - raise ValueError('Must pass non-zero number of levels/labels') + raise ValueError('Must pass non-zero number of levels/codes') result = object.__new__(MultiIndex) - # we've already validated levels and labels, so shortcut here + # we've already validated levels and codes, so shortcut here result._set_levels(levels, copy=copy, validate=False) - result._set_labels(labels, copy=copy, validate=False) + result._set_codes(codes, copy=copy, validate=False) if names is not None: # handles name validation @@ -245,39 +245,39 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, result._reset_identity() return result - def _verify_integrity(self, labels=None, levels=None): + def _verify_integrity(self, codes=None, levels=None): """ Parameters ---------- - labels : optional list - Labels to check for validity. Defaults to current labels. + codes : optional list + Codes to check for validity. Defaults to current codes. levels : optional list Levels to check for validity. Defaults to current levels. Raises ------ ValueError - If length of levels and labels don't match, if any label would - exceed level bounds, or there are any duplicate levels. + If length of levels and codes don't match, if the codes for any + level would exceed level bounds, or there are any duplicate levels. 
""" # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. - labels = labels or self.labels + codes = codes or self.codes levels = levels or self.levels - if len(levels) != len(labels): - raise ValueError("Length of levels and labels must match. NOTE:" + if len(levels) != len(codes): + raise ValueError("Length of levels and codes must match. NOTE:" " this index is in an inconsistent state.") - label_length = len(self.labels[0]) - for i, (level, label) in enumerate(zip(levels, labels)): - if len(label) != label_length: - raise ValueError("Unequal label lengths: %s" % - ([len(lab) for lab in labels])) - if len(label) and label.max() >= len(level): - raise ValueError("On level %d, label max (%d) >= length of" + codes_length = len(self.codes[0]) + for i, (level, level_codes) in enumerate(zip(levels, codes)): + if len(level_codes) != codes_length: + raise ValueError("Unequal code lengths: %s" % + ([len(code_) for code_ in codes])) + if len(level_codes) and level_codes.max() >= len(level): + raise ValueError("On level %d, code max (%d) >= length of" " level (%d). NOTE: this index is in an" - " inconsistent state" % (i, label.max(), + " inconsistent state" % (i, level_codes.max(), len(level))) if not level.is_unique: raise ValueError("Level values must be unique: {values} on " @@ -285,10 +285,177 @@ def _verify_integrity(self, labels=None, levels=None): values=[value for value in level], level=i)) + @classmethod + def from_arrays(cls, arrays, sortorder=None, names=None): + """ + Convert arrays to MultiIndex + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + """ + if not is_list_like(arrays): + raise TypeError("Input must be a list / sequence of array-likes.") + elif is_iterator(arrays): + arrays = list(arrays) + + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError('all arrays must be same length') + + from pandas.core.arrays.categorical import _factorize_from_iterables + + codes, levels = _factorize_from_iterables(arrays) + if names is None: + names = [getattr(arr, "name", None) for arr in arrays] + + return MultiIndex(levels=levels, codes=codes, sortorder=sortorder, + names=names, verify_integrity=False) + + @classmethod + def from_tuples(cls, tuples, sortorder=None, names=None): + """ + Convert list of tuples to MultiIndex + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. 
+ sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> tuples = [(1, u'red'), (1, u'blue'), + (2, u'red'), (2, u'blue')] + >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables + """ + if not is_list_like(tuples): + raise TypeError('Input must be a list / sequence of tuple-likes.') + elif is_iterator(tuples): + tuples = list(tuples) + + if len(tuples) == 0: + if names is None: + msg = 'Cannot infer number of levels from empty list' + raise TypeError(msg) + arrays = [[]] * len(names) + elif isinstance(tuples, (np.ndarray, Index)): + if isinstance(tuples, Index): + tuples = tuples._values + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) + else: + arrays = lzip(*tuples) + + return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) + + @classmethod + def from_product(cls, iterables, sortorder=None, names=None): + """ + Make a MultiIndex from the cartesian product of multiple iterables + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of strings or None + Names for the levels in the index. + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = [u'green', u'purple'] + >>> pd.MultiIndex.from_product([numbers, colors], + names=['number', 'color']) + MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=[u'number', u'color']) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + """ + from pandas.core.arrays.categorical import _factorize_from_iterables + from pandas.core.reshape.util import cartesian_product + + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + elif is_iterator(iterables): + iterables = list(iterables) + + codes, levels = _factorize_from_iterables(iterables) + codes = cartesian_product(codes) + return MultiIndex(levels, codes, sortorder=sortorder, names=names) + + # -------------------------------------------------------------------- + @property def levels(self): return self._levels + @property + def _values(self): + # We override here, since our parent uses _data, which we dont' use. + return self.values + + @property + def array(self): + """ + Raises a ValueError for `MultiIndex` because there's no single + array backing a MultiIndex. + + Raises + ------ + ValueError + """ + msg = ("MultiIndex has no single backing array. Use " + "'MultiIndex.to_numpy()' to get a NumPy array of tuples.") + raise ValueError(msg) + @property def _is_homogeneous_type(self): """Whether the levels of a MultiIndex all have the same dtype. 
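# Illustrative sketch of the labels -> codes rename covered by this patch
# (assumes a pandas version that includes it; the level and code values
# below are hypothetical):
import pandas as pd

# 'codes' replaces the deprecated 'labels' keyword; each inner list maps
# row positions to entries of the corresponding level.
mi = pd.MultiIndex(levels=[[1, 2], ['red', 'blue']],
                   codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
                   names=['number', 'color'])

# Passing labels=... still works but emits a FutureWarning via
# deprecate_kwarg, as does reading the deprecated .labels property.
legacy = pd.MultiIndex(levels=[[1, 2], ['red', 'blue']],
                       labels=[[0, 0, 1, 1], [0, 1, 0, 1]])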
@@ -360,13 +527,12 @@ def set_levels(self, levels, level=None, inplace=False, inplace : bool if True, mutates in place verify_integrity : bool (default True) - if True, checks that levels and labels are compatible + if True, checks that levels and codes are compatible Returns ------- new index (of same type and class...etc) - Examples -------- >>> idx = pd.MultiIndex.from_tuples([(1, u'one'), (1, u'two'), @@ -389,6 +555,9 @@ def set_levels(self, levels, level=None, inplace=False, labels=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[u'foo', u'bar']) """ + if is_list_like(levels) and not isinstance(levels, Index): + levels = list(levels) + if level is not None and not is_list_like(level): if not is_list_like(levels): raise TypeError("Levels must be list-like") @@ -410,54 +579,74 @@ def set_levels(self, levels, level=None, inplace=False, if not inplace: return idx + @property + def codes(self): + return self._codes + @property def labels(self): - return self._labels + warnings.warn((".labels was deprecated in version 0.24.0. " + "Use .codes instead."), + FutureWarning, stacklevel=2) + return self.codes - def _set_labels(self, labels, level=None, copy=False, validate=True, - verify_integrity=False): + def _set_codes(self, codes, level=None, copy=False, validate=True, + verify_integrity=False): - if validate and level is None and len(labels) != self.nlevels: - raise ValueError("Length of labels must match number of levels") - if validate and level is not None and len(labels) != len(level): - raise ValueError('Length of labels must match length of levels.') + if validate and level is None and len(codes) != self.nlevels: + raise ValueError("Length of codes must match number of levels") + if validate and level is not None and len(codes) != len(level): + raise ValueError('Length of codes must match length of levels.') if level is None: - new_labels = FrozenList( - _ensure_frozen(lab, lev, copy=copy)._shallow_copy() - for lev, lab in zip(self.levels, labels)) + new_codes = FrozenList( + _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy() + for lev, level_codes in zip(self.levels, codes)) else: level = [self._get_level_number(l) for l in level] - new_labels = list(self._labels) - for lev_idx, lab in zip(level, labels): + new_codes = list(self._codes) + for lev_idx, level_codes in zip(level, codes): lev = self.levels[lev_idx] - new_labels[lev_idx] = _ensure_frozen( - lab, lev, copy=copy)._shallow_copy() - new_labels = FrozenList(new_labels) + new_codes[lev_idx] = _ensure_frozen( + level_codes, lev, copy=copy)._shallow_copy() + new_codes = FrozenList(new_codes) if verify_integrity: - self._verify_integrity(labels=new_labels) + self._verify_integrity(codes=new_codes) - self._labels = new_labels + self._codes = new_codes self._tuples = None self._reset_cache() def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): + warnings.warn((".set_labels was deprecated in version 0.24.0. " + "Use .set_codes instead."), + FutureWarning, stacklevel=2) + return self.set_codes(codes=labels, level=level, inplace=inplace, + verify_integrity=verify_integrity) + + @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') + def set_codes(self, codes, level=None, inplace=False, + verify_integrity=True): """ - Set new labels on MultiIndex. Defaults to returning + Set new codes on MultiIndex. Defaults to returning new index. + .. versionadded:: 0.24.0 + + New name for deprecated method `set_labels`. 
+ Parameters ---------- - labels : sequence or list of sequence - new labels to apply + codes : sequence or list of sequence + new codes to apply level : int, level name, or sequence of int/level names (default None) level(s) to set (None for all levels) inplace : bool if True, mutates in place verify_integrity : bool (default True) - if True, checks that levels and labels are compatible + if True, checks that levels and codes are compatible Returns ------- @@ -468,47 +657,48 @@ def set_labels(self, labels, level=None, inplace=False, >>> idx = pd.MultiIndex.from_tuples([(1, u'one'), (1, u'two'), (2, u'one'), (2, u'two')], names=['foo', 'bar']) - >>> idx.set_labels([[1,0,1,0], [0,0,1,1]]) + >>> idx.set_codes([[1,0,1,0], [0,0,1,1]]) MultiIndex(levels=[[1, 2], [u'one', u'two']], labels=[[1, 0, 1, 0], [0, 0, 1, 1]], names=[u'foo', u'bar']) - >>> idx.set_labels([1,0,1,0], level=0) + >>> idx.set_codes([1,0,1,0], level=0) MultiIndex(levels=[[1, 2], [u'one', u'two']], labels=[[1, 0, 1, 0], [0, 1, 0, 1]], names=[u'foo', u'bar']) - >>> idx.set_labels([0,0,1,1], level='bar') + >>> idx.set_codes([0,0,1,1], level='bar') MultiIndex(levels=[[1, 2], [u'one', u'two']], labels=[[0, 0, 1, 1], [0, 0, 1, 1]], names=[u'foo', u'bar']) - >>> idx.set_labels([[1,0,1,0], [0,0,1,1]], level=[0,1]) + >>> idx.set_codes([[1,0,1,0], [0,0,1,1]], level=[0,1]) MultiIndex(levels=[[1, 2], [u'one', u'two']], labels=[[1, 0, 1, 0], [0, 0, 1, 1]], names=[u'foo', u'bar']) """ if level is not None and not is_list_like(level): - if not is_list_like(labels): - raise TypeError("Labels must be list-like") - if is_list_like(labels[0]): - raise TypeError("Labels must be list-like") + if not is_list_like(codes): + raise TypeError("Codes must be list-like") + if is_list_like(codes[0]): + raise TypeError("Codes must be list-like") level = [level] - labels = [labels] + codes = [codes] elif level is None or is_list_like(level): - if not is_list_like(labels) or not is_list_like(labels[0]): - raise TypeError("Labels must be list of lists-like") + if not is_list_like(codes) or not is_list_like(codes[0]): + raise TypeError("Codes must be list of lists-like") if inplace: idx = self else: idx = self._shallow_copy() idx._reset_identity() - idx._set_labels(labels, level=level, verify_integrity=verify_integrity) + idx._set_codes(codes, level=level, verify_integrity=verify_integrity) if not inplace: return idx - def copy(self, names=None, dtype=None, levels=None, labels=None, + @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') + def copy(self, names=None, dtype=None, levels=None, codes=None, deep=False, _set_identity=False, **kwargs): """ - Make a copy of this object. Names, dtype, levels and labels can be + Make a copy of this object. Names, dtype, levels and codes can be passed and will be set on new copy. 
Parameters @@ -516,7 +706,7 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, names : sequence, optional dtype : numpy dtype or pandas type, optional levels : sequence, optional - labels : sequence, optional + codes : sequence, optional Returns ------- @@ -535,14 +725,14 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, from copy import deepcopy if levels is None: levels = deepcopy(self.levels) - if labels is None: - labels = deepcopy(self.labels) + if codes is None: + codes = deepcopy(self.codes) else: if levels is None: levels = self.levels - if labels is None: - labels = self.labels - return MultiIndex(levels=levels, labels=labels, names=names, + if codes is None: + codes = self.codes + return MultiIndex(levels=levels, codes=codes, names=names, sortorder=self.sortorder, verify_integrity=False, _set_identity=_set_identity) @@ -561,7 +751,7 @@ def _shallow_copy_with_infer(self, values, **kwargs): # Therefore, an empty MultiIndex is returned GH13490 if len(values) == 0: return MultiIndex(levels=[[] for _ in range(self.nlevels)], - labels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], **kwargs) return self._shallow_copy(values, **kwargs) @@ -622,7 +812,7 @@ def _nbytes(self, deep=False): objsize = 24 level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels) - label_nbytes = sum(i.nbytes for i in self.labels) + label_nbytes = sum(i.nbytes for i in self.codes) names_nbytes = sum(getsizeof(i, objsize) for i in self.names) result = level_nbytes + label_nbytes + names_nbytes @@ -630,6 +820,9 @@ def _nbytes(self, deep=False): result += self._engine.sizeof(deep=deep) return result + # -------------------------------------------------------------------- + # Rendering Methods + def _format_attrs(self): """ Return a list of tuples of the (attr,formatted_value) @@ -637,7 +830,7 @@ def _format_attrs(self): attrs = [ ('levels', ibase.default_pprint(self._levels, max_seq_items=False)), - ('labels', ibase.default_pprint(self._labels, + ('labels', ibase.default_pprint(self._codes, max_seq_items=False))] if com._any_not_none(*self.names): attrs.append(('names', ibase.default_pprint(self.names))) @@ -652,8 +845,96 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_native_types(self, na_rep='nan', **kwargs): + new_levels = [] + new_codes = [] + + # go through the levels and format them + for level, level_codes in zip(self.levels, self.codes): + level = level._format_native_types(na_rep=na_rep, **kwargs) + # add nan values, if there are any + mask = (level_codes == -1) + if mask.any(): + nan_index = len(level) + level = np.append(level, na_rep) + level_codes = level_codes.values() + level_codes[mask] = nan_index + new_levels.append(level) + new_codes.append(level_codes) + + if len(new_levels) == 1: + return Index(new_levels[0])._format_native_types() + else: + # reconstruct the multi-index + mi = MultiIndex(levels=new_levels, codes=new_codes, + names=self.names, sortorder=self.sortorder, + verify_integrity=False) + return mi.values + + def format(self, space=2, sparsify=None, adjoin=True, names=False, + na_rep=None, formatter=None): + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, level_codes in zip(self.levels, self.codes): + na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) + + if len(lev) > 0: + + formatted = lev.take(level_codes).format(formatter=formatter) + + # we have some NA + mask = level_codes == -1 + if mask.any(): + formatted = 
np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [pprint_thing(na if isna(x) else x, + escape_chars=('\t', '\r', '\n')) + for x in algos.take_1d(lev._values, level_codes)] + stringified_levels.append(formatted) + + result_levels = [] + for lev, name in zip(stringified_levels, self.names): + level = [] + + if names: + level.append(pprint_thing(name, + escape_chars=('\t', '\r', '\n')) + if name is not None else '') + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel = '' + # GH3547 + # use value of sparsify as sentinel, unless it's an obvious + # "Truthey" value + if sparsify not in [True, 1]: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = _sparsify(result_levels, start=int(names), + sentinel=sentinel) + + if adjoin: + from pandas.io.formats.format import _get_adjustment + adj = _get_adjustment() + return adj.adjoin(space, *result_levels).split('\n') + else: + return result_levels + + # -------------------------------------------------------------------- + def __len__(self): - return len(self.labels[0]) + return len(self.codes[0]) def _get_names(self): return FrozenList(level.name for level in self.levels) @@ -713,32 +994,9 @@ def _set_names(self, names, level=None, validate=True): names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") - def _format_native_types(self, na_rep='nan', **kwargs): - new_levels = [] - new_labels = [] - - # go through the levels and format them - for level, label in zip(self.levels, self.labels): - level = level._format_native_types(na_rep=na_rep, **kwargs) - # add nan values, if there are any - mask = (label == -1) - if mask.any(): - nan_index = len(level) - level = np.append(level, na_rep) - label = label.values() - label[mask] = nan_index - new_levels.append(level) - new_labels.append(label) - - # reconstruct the multi-index - mi = MultiIndex(levels=new_levels, labels=new_labels, names=self.names, - sortorder=self.sortorder, verify_integrity=False) - - return mi.values - @Appender(_index_shared_docs['_get_grouper_for_level']) def _get_grouper_for_level(self, mapper, level): - indexer = self.labels[level] + indexer = self.codes[level] level_index = self.levels[level] if mapper is not None: @@ -747,25 +1005,24 @@ def _get_grouper_for_level(self, mapper, level): grouper = level_values.map(mapper) return grouper, None, None - labels, uniques = algos.factorize(indexer, sort=True) + codes, uniques = algos.factorize(indexer, sort=True) if len(uniques) > 0 and uniques[0] == -1: # Handle NAs mask = indexer != -1 - ok_labels, uniques = algos.factorize(indexer[mask], - sort=True) + ok_codes, uniques = algos.factorize(indexer[mask], sort=True) - labels = np.empty(len(indexer), dtype=indexer.dtype) - labels[mask] = ok_labels - labels[~mask] = -1 + codes = np.empty(len(indexer), dtype=indexer.dtype) + codes[mask] = ok_codes + codes[~mask] = -1 if len(uniques) < len(level_index): # Remove unobserved levels from level_index level_index = level_index.take(uniques) - grouper = level_index.take(labels) + grouper = level_index.take(codes) - return grouper, labels, level_index + return grouper, codes, level_index @property def _constructor(self): @@ -819,8 +1076,8 @@ def _engine(self): # Check the total number of bits needed for our representation: if lev_bits[0] > 64: # The levels would overflow a 64 bit 
uint - use Python integers: - return MultiIndexPyIntEngine(self.levels, self.labels, offsets) - return MultiIndexUIntEngine(self.levels, self.labels, offsets) + return MultiIndexPyIntEngine(self.levels, self.codes, offsets) + return MultiIndexUIntEngine(self.levels, self.codes, offsets) @property def values(self): @@ -931,7 +1188,7 @@ def duplicated(self, keep='first'): from pandas._libs.hashtable import duplicated_int64 shape = map(len, self.levels) - ids = get_group_index(self.labels, shape, sort=False, xnull=False) + ids = get_group_index(self.codes, shape, sort=False, xnull=False) return duplicated_int64(ids, keep) @@ -943,7 +1200,7 @@ def fillna(self, value=None, downcast=None): @Appender(_index_shared_docs['dropna']) def dropna(self, how='any'): - nans = [label == -1 for label in self.labels] + nans = [level_codes == -1 for level_codes in self.codes] if how == 'any': indexer = np.any(nans, axis=0) elif how == 'all': @@ -951,8 +1208,8 @@ def dropna(self, how='any'): else: raise ValueError("invalid how option: {0}".format(how)) - new_labels = [label[~indexer] for label in self.labels] - return self.copy(labels=new_labels, deep=True) + new_codes = [level_codes[~indexer] for level_codes in self.codes] + return self.copy(codes=new_codes, deep=True) def get_value(self, series, key): # somewhat broken encapsulation @@ -1033,10 +1290,10 @@ def _get_level_values(self, level, unique=False): """ values = self.levels[level] - labels = self.labels[level] + level_codes = self.codes[level] if unique: - labels = algos.unique(labels) - filled = algos.take_1d(values._values, labels, + level_codes = algos.unique(level_codes) + filled = algos.take_1d(values._values, level_codes, fill_value=values._na_value) values = values._shallow_copy(filled) return values @@ -1086,66 +1343,6 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) - def format(self, space=2, sparsify=None, adjoin=True, names=False, - na_rep=None, formatter=None): - if len(self) == 0: - return [] - - stringified_levels = [] - for lev, lab in zip(self.levels, self.labels): - na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) - - if len(lev) > 0: - - formatted = lev.take(lab).format(formatter=formatter) - - # we have some NA - mask = lab == -1 - if mask.any(): - formatted = np.array(formatted, dtype=object) - formatted[mask] = na - formatted = formatted.tolist() - - else: - # weird all NA case - formatted = [pprint_thing(na if isna(x) else x, - escape_chars=('\t', '\r', '\n')) - for x in algos.take_1d(lev._values, lab)] - stringified_levels.append(formatted) - - result_levels = [] - for lev, name in zip(stringified_levels, self.names): - level = [] - - if names: - level.append(pprint_thing(name, - escape_chars=('\t', '\r', '\n')) - if name is not None else '') - - level.extend(np.array(lev, dtype=object)) - result_levels.append(level) - - if sparsify is None: - sparsify = get_option("display.multi_sparse") - - if sparsify: - sentinel = '' - # GH3547 - # use value of sparsify as sentinel, unless it's an obvious - # "Truthey" value - if sparsify not in [True, 1]: - sentinel = sparsify - # little bit of a kludge job for #1217 - result_levels = _sparsify(result_levels, start=int(names), - sentinel=sentinel) - - if adjoin: - from pandas.io.formats.format import _get_adjustment - adj = _get_adjustment() - return adj.adjoin(space, *result_levels).split('\n') - else: - return result_levels - def _to_safe_for_reshape(self): """ convert to object if we are a 
categorical """ return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) @@ -1171,7 +1368,7 @@ def to_frame(self, index=True, name=None): ------- DataFrame : a DataFrame containing the original MultiIndex data. - See also + See Also -------- DataFrame """ @@ -1200,14 +1397,14 @@ def to_frame(self, index=True, name=None): def to_hierarchical(self, n_repeat, n_shuffle=1): """ - .. deprecated:: 0.24.0 - Return a MultiIndex reshaped to conform to the shapes given by n_repeat and n_shuffle. Useful to replicate and rearrange a MultiIndex for combination with another Index with n_repeat items. + .. deprecated:: 0.24.0 + Parameters ---------- n_repeat : int @@ -1232,14 +1429,43 @@ def to_hierarchical(self, n_repeat, n_shuffle=1): [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) """ levels = self.levels - labels = [np.repeat(x, n_repeat) for x in self.labels] - # Assumes that each label is divisible by n_shuffle - labels = [x.reshape(n_shuffle, -1).ravel(order='F') for x in labels] + codes = [np.repeat(level_codes, n_repeat) for + level_codes in self.codes] + # Assumes that each level_codes is divisible by n_shuffle + codes = [x.reshape(n_shuffle, -1).ravel(order='F') for x in codes] names = self.names warnings.warn("Method .to_hierarchical is deprecated and will " "be removed in a future version", FutureWarning, stacklevel=2) - return MultiIndex(levels=levels, labels=labels, names=names) + return MultiIndex(levels=levels, codes=codes, names=names) + + def to_flat_index(self): + """ + Convert a MultiIndex to an Index of Tuples containing the level values. + + .. versionadded:: 0.24.0 + + Returns + ------- + pd.Index + Index with the MultiIndex data represented in Tuples. + + Notes + ----- + This method will simply return the caller if called by anything other + than a MultiIndex. + + Examples + -------- + >>> index = pd.MultiIndex.from_product( + ... [['foo', 'bar'], ['baz', 'qux']], + ... names=['a', 'b']) + >>> index.to_flat_index() + Index([('foo', 'baz'), ('foo', 'qux'), + ('bar', 'baz'), ('bar', 'qux')], + dtype='object') + """ + return Index(self.values, tupleize_cols=False) @property def is_all_dates(self): @@ -1247,7 +1473,7 @@ def is_all_dates(self): def is_lexsorted(self): """ - Return True if the labels are lexicographically sorted + Return True if the codes are lexicographically sorted """ return self.lexsort_depth == self.nlevels @@ -1259,159 +1485,13 @@ def lexsort_depth(self): else: return 0 - int64_labels = [ensure_int64(lab) for lab in self.labels] + int64_codes = [ensure_int64(level_codes) for level_codes in self.codes] for k in range(self.nlevels, 0, -1): - if libalgos.is_lexsorted(int64_labels[:k]): + if libalgos.is_lexsorted(int64_codes[:k]): return k return 0 - @classmethod - def from_arrays(cls, arrays, sortorder=None, names=None): - """ - Convert arrays to MultiIndex - - Parameters - ---------- - arrays : list / sequence of array-likes - Each array-like gives one level's value for each data point. - len(arrays) is the number of levels. 
- sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level) - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - - See Also - -------- - MultiIndex.from_tuples : Convert list of tuples to MultiIndex - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables - """ - if not is_list_like(arrays): - raise TypeError("Input must be a list / sequence of array-likes.") - elif is_iterator(arrays): - arrays = list(arrays) - - # Check if lengths of all arrays are equal or not, - # raise ValueError, if not - for i in range(1, len(arrays)): - if len(arrays[i]) != len(arrays[i - 1]): - raise ValueError('all arrays must be same length') - - from pandas.core.arrays.categorical import _factorize_from_iterables - - labels, levels = _factorize_from_iterables(arrays) - if names is None: - names = [getattr(arr, "name", None) for arr in arrays] - - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, - names=names, verify_integrity=False) - - @classmethod - def from_tuples(cls, tuples, sortorder=None, names=None): - """ - Convert list of tuples to MultiIndex - - Parameters - ---------- - tuples : list / sequence of tuple-likes - Each tuple is the index of one row/column. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level) - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> tuples = [(1, u'red'), (1, u'blue'), - (2, u'red'), (2, u'blue')] - >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables - """ - if not is_list_like(tuples): - raise TypeError('Input must be a list / sequence of tuple-likes.') - elif is_iterator(tuples): - tuples = list(tuples) - - if len(tuples) == 0: - if names is None: - msg = 'Cannot infer number of levels from empty list' - raise TypeError(msg) - arrays = [[]] * len(names) - elif isinstance(tuples, (np.ndarray, Index)): - if isinstance(tuples, Index): - tuples = tuples._values - - arrays = list(lib.tuples_to_object_array(tuples).T) - elif isinstance(tuples, list): - arrays = list(lib.to_object_array_tuples(tuples).T) - else: - arrays = lzip(*tuples) - - return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) - - @classmethod - def from_product(cls, iterables, sortorder=None, names=None): - """ - Make a MultiIndex from the cartesian product of multiple iterables - - Parameters - ---------- - iterables : list / sequence of iterables - Each iterable has unique labels for each level of the index. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). - names : list / sequence of strings or None - Names for the levels in the index. 
- - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> numbers = [0, 1, 2] - >>> colors = [u'green', u'purple'] - >>> pd.MultiIndex.from_product([numbers, colors], - names=['number', 'color']) - MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - names=[u'number', u'color']) - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex - MultiIndex.from_tuples : Convert list of tuples to MultiIndex - """ - from pandas.core.arrays.categorical import _factorize_from_iterables - from pandas.core.reshape.util import cartesian_product - - if not is_list_like(iterables): - raise TypeError("Input must be a list / sequence of iterables.") - elif is_iterator(iterables): - iterables = list(iterables) - - labels, levels = _factorize_from_iterables(iterables) - labels = cartesian_product(labels) - return MultiIndex(levels, labels, sortorder=sortorder, names=names) - def _sort_levels_monotonic(self): """ .. versionadded:: 0.20.0 @@ -1434,7 +1514,7 @@ def _sort_levels_monotonic(self): -------- >>> i = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) >>> i MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) @@ -1449,9 +1529,9 @@ def _sort_levels_monotonic(self): return self new_levels = [] - new_labels = [] + new_codes = [] - for lev, lab in zip(self.levels, self.labels): + for lev, level_codes in zip(self.levels, self.codes): if not lev.is_monotonic: try: @@ -1462,15 +1542,15 @@ def _sort_levels_monotonic(self): else: lev = lev.take(indexer) - # indexer to reorder the labels + # indexer to reorder the level codes indexer = ensure_int64(indexer) ri = lib.get_reverse_indexer(indexer, len(indexer)) - lab = algos.take_1d(ri, lab) + level_codes = algos.take_1d(ri, level_codes) new_levels.append(lev) - new_labels.append(lab) + new_codes.append(level_codes) - return MultiIndex(new_levels, new_labels, + return MultiIndex(new_levels, new_codes, names=self.names, sortorder=self.sortorder, verify_integrity=False) @@ -1495,7 +1575,6 @@ def remove_unused_levels(self): MultiIndex(levels=[[0, 1], ['a', 'b']], labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) - >>> i[2:] MultiIndex(levels=[[0, 1], ['a', 'b']], labels=[[1, 1], [0, 1]]) @@ -1506,19 +1585,18 @@ def remove_unused_levels(self): >>> i[2:].remove_unused_levels() MultiIndex(levels=[[1], ['a', 'b']], labels=[[0, 0], [0, 1]]) - """ new_levels = [] - new_labels = [] + new_codes = [] changed = False - for lev, lab in zip(self.levels, self.labels): + for lev, level_codes in zip(self.levels, self.codes): # Since few levels are typically unused, bincount() is more # efficient than unique() - however it only accepts positive values # (and drops order): - uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1 + uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1 has_na = int(len(uniques) and (uniques[0] == -1)) if len(uniques) != len(lev) + has_na: @@ -1527,33 +1605,34 @@ def remove_unused_levels(self): # Recalculate uniques, now preserving order. 
# Can easily be cythonized by exploiting the already existing - # "uniques" and stop parsing "lab" when all items are found: - uniques = algos.unique(lab) + # "uniques" and stop parsing "level_codes" when all items + # are found: + uniques = algos.unique(level_codes) if has_na: na_idx = np.where(uniques == -1)[0] # Just ensure that -1 is in first position: uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] - # labels get mapped from uniques to 0:len(uniques) + # codes get mapped from uniques to 0:len(uniques) # -1 (if present) is mapped to last position - label_mapping = np.zeros(len(lev) + has_na) + code_mapping = np.zeros(len(lev) + has_na) # ... and reassigned value -1: - label_mapping[uniques] = np.arange(len(uniques)) - has_na + code_mapping[uniques] = np.arange(len(uniques)) - has_na - lab = label_mapping[lab] + level_codes = code_mapping[level_codes] # new levels are simple lev = lev.take(uniques[has_na:]) new_levels.append(lev) - new_labels.append(lab) + new_codes.append(level_codes) result = self._shallow_copy() if changed: result._reset_identity() result._set_levels(new_levels, validate=False) - result._set_labels(new_labels, validate=False) + result._set_codes(new_codes, validate=False) return result @@ -1570,7 +1649,7 @@ def levshape(self): def __reduce__(self): """Necessary for making this object picklable""" d = dict(levels=[lev for lev in self.levels], - labels=[label for label in self.labels], + codes=[level_codes for level_codes in self.codes], sortorder=self.sortorder, names=list(self.names)) return ibase._new_Index, (self.__class__, d), None @@ -1579,17 +1658,17 @@ def __setstate__(self, state): if isinstance(state, dict): levels = state.get('levels') - labels = state.get('labels') + codes = state.get('codes') sortorder = state.get('sortorder') names = state.get('names') elif isinstance(state, tuple): nd_state, own_state = state - levels, labels, sortorder, names = own_state + levels, codes, sortorder, names = own_state self._set_levels([Index(x) for x in levels], validate=False) - self._set_labels(labels) + self._set_codes(codes) self._set_names(names) self.sortorder = sortorder self._verify_integrity() @@ -1600,16 +1679,16 @@ def __getitem__(self, key): key = com.cast_scalar_indexer(key) retval = [] - for lev, lab in zip(self.levels, self.labels): - if lab[key] == -1: + for lev, level_codes in zip(self.levels, self.codes): + if level_codes[key] == -1: retval.append(np.nan) else: - retval.append(lev[lab[key]]) + retval.append(lev[level_codes[key]]) return tuple(retval) else: if com.is_bool_indexer(key): - key = np.asarray(key) + key = np.asarray(key, dtype=bool) sortorder = self.sortorder else: # cannot be sure whether the result will be sorted @@ -1618,9 +1697,9 @@ def __getitem__(self, key): if isinstance(key, Index): key = np.asarray(key) - new_labels = [lab[key] for lab in self.labels] + new_codes = [level_codes[key] for level_codes in self.codes] - return MultiIndex(levels=self.levels, labels=new_labels, + return MultiIndex(levels=self.levels, codes=new_codes, names=self.names, sortorder=sortorder, verify_integrity=False) @@ -1629,11 +1708,11 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) - taken = self._assert_take_fillable(self.labels, indices, + taken = self._assert_take_fillable(self.codes, indices, allow_fill=allow_fill, fill_value=fill_value, na_value=-1) - return MultiIndex(levels=self.levels, labels=taken, + return MultiIndex(levels=self.levels, 
codes=taken, names=self.names, verify_integrity=False) def _assert_take_fillable(self, values, indices, allow_fill=True, @@ -1645,17 +1724,17 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, msg = ('When allow_fill=True and fill_value is not None, ' 'all indices must be >= -1') raise ValueError(msg) - taken = [lab.take(indices) for lab in self.labels] + taken = [lab.take(indices) for lab in self.codes] mask = indices == -1 if mask.any(): masked = [] for new_label in taken: label_values = new_label.values() label_values[mask] = na_value - masked.append(FrozenNDArray(label_values)) + masked.append(np.asarray(label_values)) taken = masked else: - taken = [lab.take(indices) for lab in self.labels] + taken = [lab.take(indices) for lab in self.codes] return taken def append(self, other): @@ -1697,21 +1776,23 @@ def argsort(self, *args, **kwargs): def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return MultiIndex(levels=self.levels, - labels=[label.view(np.ndarray).repeat(repeats) - for label in self.labels], names=self.names, - sortorder=self.sortorder, verify_integrity=False) + codes=[level_codes.view(np.ndarray).repeat(repeats) + for level_codes in self.codes], + names=self.names, sortorder=self.sortorder, + verify_integrity=False) def where(self, cond, other=None): raise NotImplementedError(".where is not supported for " "MultiIndex operations") - def drop(self, labels, level=None, errors='raise'): + @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') + def drop(self, codes, level=None, errors='raise'): """ - Make new MultiIndex with passed list of labels deleted + Make new MultiIndex with passed list of codes deleted Parameters ---------- - labels : array-like + codes : array-like Must be a list of tuples level : int or level name, default None @@ -1720,24 +1801,24 @@ def drop(self, labels, level=None, errors='raise'): dropped : MultiIndex """ if level is not None: - return self._drop_from_level(labels, level) + return self._drop_from_level(codes, level) try: - if not isinstance(labels, (np.ndarray, Index)): - labels = com.index_labels_to_array(labels) - indexer = self.get_indexer(labels) + if not isinstance(codes, (np.ndarray, Index)): + codes = com.index_labels_to_array(codes) + indexer = self.get_indexer(codes) mask = indexer == -1 if mask.any(): if errors != 'ignore': - raise ValueError('labels %s not contained in axis' % - labels[mask]) + raise ValueError('codes %s not contained in axis' % + codes[mask]) except Exception: pass inds = [] - for label in labels: + for level_codes in codes: try: - loc = self.get_loc(label) + loc = self.get_loc(level_codes) # get_loc returns either an integer, a slice, or a boolean # mask if isinstance(loc, int): @@ -1762,13 +1843,13 @@ def drop(self, labels, level=None, errors='raise'): return self.delete(inds) - def _drop_from_level(self, labels, level): - labels = com.index_labels_to_array(labels) + def _drop_from_level(self, codes, level): + codes = com.index_labels_to_array(codes) i = self._get_level_number(level) index = self.levels[i] - values = index.get_indexer(labels) + values = index.get_indexer(codes) - mask = ~algos.isin(self.labels[i], values) + mask = ~algos.isin(self.codes[i], values) return self[mask] @@ -1799,14 +1880,14 @@ def swaplevel(self, i=-2, j=-1): See Also -------- - Series.swaplevel : Swap levels i and j in a MultiIndex + Series.swaplevel : Swap levels i and j in a MultiIndex. 
         Dataframe.swaplevel : Swap levels i and j in a MultiIndex on a
-            particular axis
+            particular axis.

         Examples
         --------
         >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
-        ...                    labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
+        ...                    codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
         >>> mi
         MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
                    labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
@@ -1815,17 +1896,17 @@ def swaplevel(self, i=-2, j=-1):
                    labels=[[0, 1, 0, 1], [0, 0, 1, 1]])
         """
         new_levels = list(self.levels)
-        new_labels = list(self.labels)
+        new_codes = list(self.codes)
         new_names = list(self.names)

         i = self._get_level_number(i)
         j = self._get_level_number(j)

         new_levels[i], new_levels[j] = new_levels[j], new_levels[i]
-        new_labels[i], new_labels[j] = new_labels[j], new_labels[i]
+        new_codes[i], new_codes[j] = new_codes[j], new_codes[i]
         new_names[i], new_names[j] = new_names[j], new_names[i]

-        return MultiIndex(levels=new_levels, labels=new_labels,
+        return MultiIndex(levels=new_levels, codes=new_codes,
                           names=new_names, verify_integrity=False)

     def reorder_levels(self, order):
@@ -1841,31 +1922,33 @@ def reorder_levels(self, order):
                              'number of levels (%d), got %d' %
                              (self.nlevels, len(order)))
         new_levels = [self.levels[i] for i in order]
-        new_labels = [self.labels[i] for i in order]
+        new_codes = [self.codes[i] for i in order]
         new_names = [self.names[i] for i in order]

-        return MultiIndex(levels=new_levels, labels=new_labels,
+        return MultiIndex(levels=new_levels, codes=new_codes,
                           names=new_names, verify_integrity=False)

     def __getslice__(self, i, j):
         return self.__getitem__(slice(i, j))

-    def _get_labels_for_sorting(self):
+    def _get_codes_for_sorting(self):
         """
-        we categorizing our labels by using the
-        available catgories (all, not just observed)
+        we categorizing our codes by using the
+        available categories (all, not just observed)
         excluding any missing ones (-1); this is
         in preparation for sorting, where we need
         to disambiguate that -1 is not a valid valid
         """
         from pandas.core.arrays import Categorical

-        def cats(label):
-            return np.arange(np.array(label).max() + 1 if len(label) else 0,
-                             dtype=label.dtype)
+        def cats(level_codes):
+            return np.arange(np.array(level_codes).max() + 1 if
+                             len(level_codes) else 0,
+                             dtype=level_codes.dtype)

-        return [Categorical.from_codes(label, cats(label), ordered=True)
-                for label in self.labels]
+        return [Categorical.from_codes(level_codes, cats(level_codes),
+                                       ordered=True)
+                for level_codes in self.codes]

     def sortlevel(self, level=0, ascending=True, sort_remaining=True):
         """
@@ -1880,7 +1963,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
         ascending : boolean, default True
             False to sort in descending order
             Can also be a list to specify a directed ordering
-        sort_remaining : sort by the remaining levels after level.
+ sort_remaining : sort by the remaining levels after level Returns ------- @@ -1888,7 +1971,6 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Resulting index indexer : np.ndarray Indices of output values in original index - """ from pandas.core.sorting import indexer_from_factorized @@ -1903,21 +1985,21 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): raise ValueError("level must have same length as ascending") from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer([self.labels[lev] for lev in level], + indexer = lexsort_indexer([self.codes[lev] for lev in level], orders=ascending) # level ordering else: - labels = list(self.labels) + codes = list(self.codes) shape = list(self.levshape) - # partition labels and shape - primary = tuple(labels.pop(lev - i) for i, lev in enumerate(level)) + # partition codes and shape + primary = tuple(codes.pop(lev - i) for i, lev in enumerate(level)) primshp = tuple(shape.pop(lev - i) for i, lev in enumerate(level)) if sort_remaining: - primary += primary + tuple(labels) + primary += primary + tuple(codes) primshp += primshp + tuple(shape) else: sortorder = level[0] @@ -1929,9 +2011,9 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): indexer = indexer[::-1] indexer = ensure_platform_int(indexer) - new_labels = [lab.take(indexer) for lab in self.labels] + new_codes = [level_codes.take(indexer) for level_codes in self.codes] - new_index = MultiIndex(labels=new_labels, levels=self.levels, + new_index = MultiIndex(codes=new_codes, levels=self.levels, names=self.names, sortorder=sortorder, verify_integrity=False) @@ -2146,7 +2228,7 @@ def _partial_tup_index(self, tup, side='left'): n = len(tup) start, end = 0, len(self) - zipped = zip(tup, self.levels, self.labels) + zipped = zip(tup, self.levels, self.codes) for k, (lab, lev, labs) in enumerate(zipped): section = labs[start:end] @@ -2199,9 +2281,9 @@ def get_loc(self, key, method=None): or a sequence of such. If you want to use those, use :meth:`MultiIndex.get_locs` instead. - See also + See Also -------- - Index.get_loc : get_loc method for (single-level) index. + Index.get_loc : The get_loc method for (single-level) index. MultiIndex.slice_locs : Get slice location given start label(s) and end label(s). MultiIndex.get_locs : Get location for a label/slice/list/mask or a @@ -2258,7 +2340,7 @@ def _maybe_to_slice(loc): loc = np.arange(start, stop, dtype='int64') for i, k in enumerate(follow_key, len(lead_key)): - mask = self.labels[i][loc] == self.levels[i].get_loc(k) + mask = self.codes[i][loc] == self.levels[i].get_loc(k) if not mask.all(): loc = loc[mask] if not len(loc): @@ -2305,7 +2387,7 @@ def get_loc_level(self, key, level=0, drop_level=True): --------- MultiIndex.get_loc : Get location for a label or a tuple of labels. MultiIndex.get_locs : Get location for a label/slice/list/mask or a - sequence of such + sequence of such. 
""" def maybe_droplevels(indexer, levels, drop_level): @@ -2409,15 +2491,16 @@ def _get_level_indexer(self, key, level=0, indexer=None): # if the indexer is provided, then use this level_index = self.levels[level] - labels = self.labels[level] + level_codes = self.codes[level] - def convert_indexer(start, stop, step, indexer=indexer, labels=labels): - # given the inputs and the labels/indexer, compute an indexer set + def convert_indexer(start, stop, step, indexer=indexer, + codes=level_codes): + # given the inputs and the codes/indexer, compute an indexer set # if we have a provided indexer, then this need not consider # the entire labels set r = np.arange(start, stop, step) - if indexer is not None and len(indexer) != len(labels): + if indexer is not None and len(indexer) != len(codes): # we have an indexer which maps the locations in the labels # that we have already selected (and is not an indexer for the @@ -2427,14 +2510,14 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): # selected from pandas import Series mapper = Series(indexer) - indexer = labels.take(ensure_platform_int(indexer)) + indexer = codes.take(ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) m = result.map(mapper)._ndarray_values else: - m = np.zeros(len(labels), dtype=bool) - m[np.in1d(labels, r, - assume_unique=Index(labels).is_unique)] = True + m = np.zeros(len(codes), dtype=bool) + m[np.in1d(codes, r, + assume_unique=Index(codes).is_unique)] = True return m @@ -2474,8 +2557,8 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): return convert_indexer(start, stop + 1, step) else: # sorted, so can return slice object -> view - i = labels.searchsorted(start, side='left') - j = labels.searchsorted(stop, side='right') + i = level_codes.searchsorted(start, side='left') + j = level_codes.searchsorted(stop, side='right') return slice(i, j, step) else: @@ -2484,14 +2567,14 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted - locs = np.array(labels == code, dtype=bool, copy=False) + locs = np.array(level_codes == code, dtype=bool, copy=False) if not locs.any(): # The label is present in self.levels[level] but unused: raise KeyError(key) return locs - i = labels.searchsorted(code, side='left') - j = labels.searchsorted(code, side='right') + i = level_codes.searchsorted(code, side='left') + j = level_codes.searchsorted(code, side='right') if i == j: # The label is present in self.levels[level] but unused: raise KeyError(key) @@ -2525,7 +2608,7 @@ def get_locs(self, seq): >>> mi.get_locs([[True, False, True], slice('e', 'f')]) array([2], dtype=int64) - See also + See Also -------- MultiIndex.get_loc : Get location for a label or a tuple of labels. 
MultiIndex.slice_locs : Get slice location given start label(s) and @@ -2641,10 +2724,10 @@ def truncate(self, before=None, after=None): new_levels = list(self.levels) new_levels[0] = new_levels[0][i:j] - new_labels = [lab[left:right] for lab in self.labels] - new_labels[0] = new_labels[0] - i + new_codes = [level_codes[left:right] for level_codes in self.codes] + new_codes[0] = new_codes[0] - i - return MultiIndex(levels=new_levels, labels=new_labels, + return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) def equals(self, other): @@ -2652,7 +2735,7 @@ def equals(self, other): Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) - See also + See Also -------- equal_levels """ @@ -2673,26 +2756,26 @@ def equals(self, other): return False for i in range(self.nlevels): - slabels = self.labels[i] - slabels = slabels[slabels != -1] - svalues = algos.take_nd(np.asarray(self.levels[i]._values), - slabels, allow_fill=False) - - olabels = other.labels[i] - olabels = olabels[olabels != -1] - ovalues = algos.take_nd( + self_codes = self.codes[i] + self_codes = self_codes[self_codes != -1] + self_values = algos.take_nd(np.asarray(self.levels[i]._values), + self_codes, allow_fill=False) + + other_codes = other.codes[i] + other_codes = other_codes[other_codes != -1] + other_values = algos.take_nd( np.asarray(other.levels[i]._values), - olabels, allow_fill=False) + other_codes, allow_fill=False) # since we use NaT both datetime64 and timedelta64 # we can have a situation where a level is typed say # timedelta64 in self (IOW it has other values than NaT) # but types datetime64 in other (where its all NaT) # but these are equivalent - if len(svalues) == 0 and len(ovalues) == 0: + if len(self_values) == 0 and len(other_values) == 0: continue - if not array_equivalent(svalues, ovalues): + if not array_equivalent(self_values, other_values): return False return True @@ -2758,16 +2841,24 @@ def intersection(self, other): uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) if len(uniq_tuples) == 0: return MultiIndex(levels=self.levels, - labels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, names=result_names, verify_integrity=False) else: return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) - def difference(self, other): + def difference(self, other, sort=True): """ Compute sorted set difference of two MultiIndex objects + Parameters + ---------- + other : MultiIndex + sort : bool, default True + Sort the resulting MultiIndex if possible + + .. 
versionadded:: 0.24.0 + Returns ------- diff : MultiIndex @@ -2780,15 +2871,23 @@ def difference(self, other): if self.equals(other): return MultiIndex(levels=self.levels, - labels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, names=result_names, verify_integrity=False) - difference = sorted(set(self._ndarray_values) - - set(other._ndarray_values)) + this = self._get_unique_index() + + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) + + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + difference = this.values.take(label_diff) + if sort: + difference = sorted(difference) if len(difference) == 0: return MultiIndex(levels=[[]] * self.nlevels, - labels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, names=result_names, verify_integrity=False) else: return MultiIndex.from_tuples(difference, sortorder=0, @@ -2814,7 +2913,7 @@ def _convert_can_do_setop(self, other): if not hasattr(other, 'names'): if len(other) == 0: other = MultiIndex(levels=[[]] * self.nlevels, - labels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, verify_integrity=False) else: msg = 'other must be a MultiIndex or a list of tuples' @@ -2849,21 +2948,22 @@ def insert(self, loc, item): 'levels.') new_levels = [] - new_labels = [] - for k, level, labels in zip(item, self.levels, self.labels): + new_codes = [] + for k, level, level_codes in zip(item, self.levels, self.codes): if k not in level: # have to insert into level # must insert at end otherwise you have to recompute all the - # other labels + # other codes lev_loc = len(level) level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) new_levels.append(level) - new_labels.append(np.insert(ensure_int64(labels), loc, lev_loc)) + new_codes.append(np.insert( + ensure_int64(level_codes), loc, lev_loc)) - return MultiIndex(levels=new_levels, labels=new_labels, + return MultiIndex(levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False) def delete(self, loc): @@ -2874,8 +2974,8 @@ def delete(self, loc): ------- new_index : MultiIndex """ - new_labels = [np.delete(lab, loc) for lab in self.labels] - return MultiIndex(levels=self.levels, labels=new_labels, + new_codes = [np.delete(level_codes, loc) for level_codes in self.codes] + return MultiIndex(levels=self.levels, codes=new_codes, names=self.names, verify_integrity=False) def _wrap_joined_index(self, joined, other): @@ -2891,13 +2991,13 @@ def isin(self, values, level=None): else: num = self._get_level_number(level) levs = self.levels[num] - labs = self.labels[num] + level_codes = self.codes[num] sought_labels = levs.isin(values).nonzero()[0] if levs.size == 0: - return np.zeros(len(labs), dtype=np.bool_) + return np.zeros(len(level_codes), dtype=np.bool_) else: - return np.lib.arraysetops.in1d(labs, sought_labels) + return np.lib.arraysetops.in1d(level_codes, sought_labels) MultiIndex._add_numeric_methods_disabled() diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 7f64fb744c682..491176bc586a8 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,26 +1,23 @@ +import warnings + import numpy as np -from pandas._libs import (index as libindex, - join as libjoin) + +from pandas._libs import index as libindex +import pandas.compat as compat +from pandas.util._decorators import Appender, cache_readonly + from pandas.core.dtypes.common import ( - is_dtype_equal, - pandas_dtype, - needs_i8_conversion, - is_integer_dtype, - is_float, - is_bool, - is_bool_dtype, - 
is_scalar) + is_bool, is_bool_dtype, is_dtype_equal, is_float, is_integer_dtype, + is_scalar, needs_i8_conversion, pandas_dtype) +import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna -from pandas import compat from pandas.core import algorithms import pandas.core.common as com +import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, InvalidIndexError, _index_shared_docs) -from pandas.util._decorators import Appender, cache_readonly -import pandas.core.dtypes.concat as _concat -import pandas.core.indexes.base as ibase - +from pandas.core.ops import get_op_result_name _num_index_shared_docs = dict() @@ -35,10 +32,14 @@ class NumericIndex(Index): _is_numeric_dtype = True def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=False): + fastpath=None): - if fastpath: - return cls._simple_new(data, name=name) + if fastpath is not None: + warnings.warn("The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, stacklevel=2) + if fastpath: + return cls._simple_new(data, name=name) # is_scalar, generators handled in coerce_to_ndarray data = cls._coerce_to_ndarray(data) @@ -150,9 +151,9 @@ def insert(self, loc, item): ----- An Index instance can **only** contain hashable objects. - See also + See Also -------- - Index : The base pandas Index type + Index : The base pandas Index type. """ _int64_descr_args = dict( @@ -185,10 +186,6 @@ class Int64Index(IntegerIndex): __doc__ = _num_index_shared_docs['class_descr'] % _int64_descr_args _typ = 'int64index' - _left_indexer_unique = libjoin.left_join_indexer_unique_int64 - _left_indexer = libjoin.left_join_indexer_int64 - _inner_indexer = libjoin.inner_join_indexer_int64 - _outer_indexer = libjoin.outer_join_indexer_int64 _can_hold_na = False _engine_type = libindex.Int64Engine _default_dtype = np.int64 @@ -214,7 +211,7 @@ def _convert_scalar_indexer(self, key, kind=None): ._convert_scalar_indexer(key, kind=kind)) def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None + name = get_op_result_name(self, other) return Int64Index(joined, name=name) @classmethod @@ -243,10 +240,6 @@ class UInt64Index(IntegerIndex): __doc__ = _num_index_shared_docs['class_descr'] % _uint64_descr_args _typ = 'uint64index' - _left_indexer_unique = libjoin.left_join_indexer_unique_uint64 - _left_indexer = libjoin.left_join_indexer_uint64 - _inner_indexer = libjoin.inner_join_indexer_uint64 - _outer_indexer = libjoin.outer_join_indexer_uint64 _can_hold_na = False _engine_type = libindex.UInt64Engine _default_dtype = np.uint64 @@ -291,7 +284,7 @@ def _convert_index_indexer(self, keyarr): return keyarr def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None + name = get_op_result_name(self, other) return UInt64Index(joined, name=name) @classmethod @@ -321,11 +314,6 @@ class Float64Index(NumericIndex): _typ = 'float64index' _engine_type = libindex.Float64Engine - _left_indexer_unique = libjoin.left_join_indexer_unique_float64 - _left_indexer = libjoin.left_join_indexer_float64 - _inner_indexer = libjoin.inner_join_indexer_float64 - _outer_indexer = libjoin.outer_join_indexer_float64 - _default_dtype = np.float64 @property diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index cc008694a8b84..3d69a0a84f7ae 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,81 +1,79 @@ # pylint: disable=E1101,E1103,W0232 -from 
datetime import datetime -import numpy as np +from datetime import datetime, timedelta import warnings -from pandas.core import common as com +import numpy as np + +from pandas._libs import index as libindex +from pandas._libs.tslibs import NaT, iNaT, resolution +from pandas._libs.tslibs.period import ( + DIFFERENT_FREQ_INDEX, IncompatibleFrequency, Period) +from pandas.util._decorators import ( + Appender, Substitution, cache_readonly, deprecate_kwarg) + from pandas.core.dtypes.common import ( - is_integer, - is_float, - is_integer_dtype, - is_float_dtype, - is_scalar, - is_datetime64_dtype, - is_datetime64_any_dtype, - is_period_dtype, - is_bool_dtype, - pandas_dtype, - ensure_object) - -import pandas.tseries.frequencies as frequencies -from pandas.tseries.frequencies import get_freq_code as _gfc - -from pandas.core.indexes.datetimes import DatetimeIndex, Int64Index, Index -from pandas.core.indexes.datetimelike import DatelikeOps, DatetimeIndexOpsMixin -from pandas.core.tools.datetimes import parse_time_string - -from pandas._libs.lib import infer_dtype -from pandas._libs import tslib, index as libindex, Timedelta -from pandas._libs.tslibs.period import (Period, IncompatibleFrequency, - DIFFERENT_FREQ_INDEX, - _validate_end_alias) -from pandas._libs.tslibs import resolution, period - -from pandas.core.arrays.period import PeriodArrayMixin + is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype, + is_integer, is_integer_dtype, pandas_dtype) + +from pandas import compat +from pandas.core import common as com +from pandas.core.accessor import delegate_names +from pandas.core.algorithms import unique1d +from pandas.core.arrays.datetimelike import DatelikeOps +from pandas.core.arrays.period import ( + PeriodArray, period_array, validate_dtype_freq) from pandas.core.base import _shared_docs +import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, ensure_index +from pandas.core.indexes.datetimelike import ( + DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, wrap_arithmetic_op) +from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index +from pandas.core.missing import isna +from pandas.core.ops import get_op_result_name +from pandas.core.tools.datetimes import DateParseError, parse_time_string -from pandas import compat -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.tseries import frequencies +from pandas.tseries.offsets import DateOffset, Tick -import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( dict(target_klass='PeriodIndex or list of Periods')) -def _wrap_field_accessor(name): - fget = getattr(PeriodArrayMixin, name).fget - - def f(self): - result = fget(self) - return Index(result, name=self.name) - - f.__name__ = name - f.__doc__ = fget.__doc__ - return property(f) - - -def dt64arr_to_periodarr(data, freq, tz): - if data.dtype != np.dtype('M8[ns]'): - raise ValueError('Wrong dtype: %s' % data.dtype) - - freq = Period._maybe_convert_freq(freq) - base, mult = _gfc(freq) - return period.dt64arr_to_periodarr(data.view('i8'), base, tz) - # --- Period index sketch def _new_PeriodIndex(cls, **d): # GH13277 for unpickling - if d['data'].dtype == 'int64': - values = d.pop('data') - return cls._from_ordinals(values=values, **d) + values = d.pop('data') + if values.dtype == 'int64': + freq = d.pop('freq', None) + values = PeriodArray(values, freq=freq) + return cls._simple_new(values, **d) + else: + return cls(values, **d) 
-class PeriodIndex(PeriodArrayMixin, DatelikeOps, DatetimeIndexOpsMixin, - Int64Index): +class PeriodDelegateMixin(DatetimelikeDelegateMixin): + """ + Delegate from PeriodIndex to PeriodArray. + """ + _delegate_class = PeriodArray + _delegated_properties = PeriodArray._datetimelike_ops + _delegated_methods = ( + set(PeriodArray._datetimelike_methods) | {'_addsub_int_array'} + ) + _raw_properties = {'is_leap_year'} + + +@delegate_names(PeriodArray, + PeriodDelegateMixin._delegated_properties, + typ='property') +@delegate_names(PeriodArray, + PeriodDelegateMixin._delegated_methods, + typ="method") +class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index, + PeriodDelegateMixin): """ Immutable ndarray holding ordinal values indicating regular periods in time such as particular years, quarters, months, etc. @@ -148,32 +146,25 @@ class PeriodIndex(PeriodArrayMixin, DatelikeOps, DatetimeIndexOpsMixin, See Also --------- - Index : The base pandas Index type - Period : Represents a period of time - DatetimeIndex : Index with datetime64 data - TimedeltaIndex : Index of timedelta64 data + Index : The base pandas Index type. + Period : Represents a period of time. + DatetimeIndex : Index with datetime64 data. + TimedeltaIndex : Index of timedelta64 data. """ _typ = 'periodindex' _attributes = ['name', 'freq'] # define my properties & methods for delegation - _other_ops = [] - _bool_ops = ['is_leap_year'] - _object_ops = ['start_time', 'end_time', 'freq'] - _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second', - 'weekofyear', 'weekday', 'week', 'dayofweek', - 'dayofyear', 'quarter', 'qyear', - 'days_in_month', 'daysinmonth'] - _datetimelike_ops = _field_ops + _object_ops + _bool_ops - _datetimelike_methods = ['strftime', 'to_timestamp', 'asfreq'] - _is_numeric_dtype = False _infer_as_myclass = True - _freq = None + _data = None # type: PeriodArray _engine_type = libindex.PeriodEngine + # ------------------------------------------------------------------------ + # Index Constructors + def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, periods=None, tz=None, dtype=None, copy=False, name=None, **fields): @@ -185,121 +176,229 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, raise TypeError('__new__() got an unexpected keyword argument {}'. format(list(set(fields) - valid_field_set)[0])) - if periods is not None: - if is_float(periods): - periods = int(periods) - elif not is_integer(periods): - msg = 'periods must be a number, got {periods}' - raise TypeError(msg.format(periods=periods)) - if name is None and hasattr(data, 'name'): name = data.name - if dtype is not None: - dtype = pandas_dtype(dtype) - if not is_period_dtype(dtype): - raise ValueError('dtype must be PeriodDtype') - if freq is None: - freq = dtype.freq - elif freq != dtype.freq: - msg = 'specified freq and dtype are different' - raise IncompatibleFrequency(msg) + if data is None and ordinal is None: + # range-based. + data, freq = PeriodArray._generate_range(start, end, periods, + freq, fields) + data = PeriodArray(data, freq=freq) + else: + freq = validate_dtype_freq(dtype, freq) - # coerce freq to freq object, otherwise it can be coerced elementwise - # which is slow - if freq: - freq = Period._maybe_convert_freq(freq) + # PeriodIndex allow PeriodIndex(period_index, freq=different) + # Let's not encourage that kind of behavior in PeriodArray. 
- if data is None: - if ordinal is not None: - data = np.asarray(ordinal, dtype=np.int64) - else: - data, freq = cls._generate_range(start, end, periods, - freq, fields) - return cls._from_ordinals(data, name=name, freq=freq) - - if isinstance(data, PeriodIndex): - if freq is None or freq == data.freq: # no freq change - freq = data.freq - data = data._ndarray_values + if freq and isinstance(data, cls) and data.freq != freq: + # TODO: We can do some of these with no-copy / coercion? + # e.g. D -> 2D seems to be OK + data = data.asfreq(freq) + + if data is None and ordinal is not None: + # we strangely ignore `ordinal` if data is passed. + ordinal = np.asarray(ordinal, dtype=np.int64) + data = PeriodArray(ordinal, freq) else: - base1, _ = _gfc(data.freq) - base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data._ndarray_values, - base1, base2, 1) - return cls._simple_new(data, name=name, freq=freq) - - # not array / index - if not isinstance(data, (np.ndarray, PeriodIndex, - DatetimeIndex, Int64Index)): - if is_scalar(data) or isinstance(data, Period): - cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (list, tuple)): - data = list(data) - - data = np.asarray(data) - - # datetime other than period - if is_datetime64_dtype(data.dtype): - data = dt64arr_to_periodarr(data, freq, tz) - return cls._from_ordinals(data, name=name, freq=freq) - - # check not floats - if infer_dtype(data) == 'floating' and len(data) > 0: - raise TypeError("PeriodIndex does not allow " - "floating point in construction") - - # anything else, likely an array of strings or periods - data = ensure_object(data) - freq = freq or period.extract_freq(data) - data = period.extract_ordinals(data, freq) - return cls._from_ordinals(data, name=name, freq=freq) + # don't pass copy here, since we copy later. + data = period_array(data=data, freq=freq) - @cache_readonly - def _engine(self): - return self._engine_type(lambda: self, len(self)) + if copy: + data = data.copy() + + return cls._simple_new(data, name=name) @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): """ - Values can be any type that can be coerced to Periods. - Ordinals in an ndarray are fastpath-ed to `_from_ordinals` - """ - if not is_integer_dtype(values): - values = np.array(values, copy=False) - if len(values) > 0 and is_float_dtype(values): - raise TypeError("PeriodIndex can't take floats") - return cls(values, name=name, freq=freq, **kwargs) + Create a new PeriodIndex. - return cls._from_ordinals(values, name, freq, **kwargs) + Parameters + ---------- + values : PeriodArray, PeriodIndex, Index[int64], ndarray[int64] + Values that can be converted to a PeriodArray without inference + or coercion. - @classmethod - def _from_ordinals(cls, values, name=None, freq=None, **kwargs): - """ - Values should be int ordinals - `__new__` & `_simple_new` cooerce to ordinals and call this method """ - result = super(PeriodIndex, cls)._from_ordinals(values, freq) - + # TODO: raising on floats is tested, but maybe not useful. + # Should the callers know not to pass floats? + # At the very least, I think we can ensure that lists aren't passed. 
+ if isinstance(values, list): + values = np.asarray(values) + if is_float_dtype(values): + raise TypeError("PeriodIndex._simple_new does not accept floats.") + values = PeriodArray(values, freq=freq) + + if not isinstance(values, PeriodArray): + raise TypeError("PeriodIndex._simple_new only accepts PeriodArray") + result = object.__new__(cls) + result._data = values result.name = name result._reset_identity() return result - def _shallow_copy_with_infer(self, values, **kwargs): + # ------------------------------------------------------------------------ + # Wrapping PeriodArray + + # ------------------------------------------------------------------------ + # Data + + @property + def _ndarray_values(self): + return self._data._ndarray_values + + @property + def values(self): + return np.asarray(self) + + @property + def _values(self): + return self._data + + @property + def freq(self): + # TODO(DatetimeArray): remove + # Can't simply use delegate_names since our base class is defining + # freq + return self._data.freq + + @freq.setter + def freq(self, value): + value = Period._maybe_convert_freq(value) + msg = ('Setting {cls}.freq has been deprecated and will be ' + 'removed in a future version; use {cls}.asfreq instead. ' + 'The {cls}.freq setter is not guaranteed to work.') + warnings.warn(msg.format(cls=type(self).__name__), + FutureWarning, stacklevel=2) + # PeriodArray._freq isn't actually mutable. We set the private _freq + # here, but people shouldn't be doing this anyway. + self._data._freq = value + + def _shallow_copy(self, values=None, **kwargs): + # TODO: simplify, figure out type of values + if values is None: + values = self._data + + if isinstance(values, type(self)): + values = values._values + + if not isinstance(values, PeriodArray): + if (isinstance(values, np.ndarray) and + is_integer_dtype(values.dtype)): + values = PeriodArray(values, freq=self.freq) + else: + # in particular, I would like to avoid period_array here. + # Some people seem to be calling use with unexpected types + # Index.difference -> ndarray[Period] + # DatetimelikeIndexOpsMixin.repeat -> ndarray[ordinal] + # I think that once all of Datetime* are EAs, we can simplify + # this quite a bit. + values = period_array(values, freq=self.freq) + + # I don't like overloading shallow_copy with freq changes. + # See if it's used anywhere outside of test_resample_empty_dataframe + attributes = self._get_attributes_dict() + freq = kwargs.pop("freq", None) + if freq: + values = values.asfreq(freq) + attributes.pop("freq", None) + + attributes.update(kwargs) + if not len(values) and 'dtype' not in kwargs: + attributes['dtype'] = self.dtype + return self._simple_new(values, **attributes) + + def _shallow_copy_with_infer(self, values=None, **kwargs): """ we always want to return a PeriodIndex """ return self._shallow_copy(values=values, **kwargs) - def _coerce_scalar_to_index(self, item): + @property + def _box_func(self): + """Maybe box an ordinal or Period""" + # TODO(DatetimeArray): Avoid double-boxing + # PeriodArray takes care of boxing already, so we need to check + # whether we're given an ordinal or a Period. It seems like some + # places outside of indexes/period.py are calling this _box_func, + # but passing data that's already boxed. 
+ def func(x): + if isinstance(x, Period) or x is NaT: + return x + else: + return Period._from_ordinal(ordinal=x, freq=self.freq) + return func + + def _maybe_box_as_values(self, values, **attribs): + """Box an array of ordinals to a PeriodArray + + This is purely for compatibility between PeriodIndex + and Datetime/TimedeltaIndex. Once these are all backed by + an ExtensionArray, this can be removed """ - we need to coerce a scalar to a compat for our index type + # TODO(DatetimeArray): remove + freq = attribs['freq'] + return PeriodArray(values, freq=freq) + + def _maybe_convert_timedelta(self, other): + """ + Convert timedelta-like input to an integer multiple of self.freq Parameters ---------- - item : scalar item to coerce + other : timedelta, np.timedelta64, DateOffset, int, np.ndarray + + Returns + ------- + converted : int, np.ndarray[int64] + + Raises + ------ + IncompatibleFrequency : if the input cannot be written as a multiple + of self.freq. Note IncompatibleFrequency subclasses ValueError. """ - return PeriodIndex([item], **self._get_attributes_dict()) + if isinstance( + other, (timedelta, np.timedelta64, Tick, np.ndarray)): + offset = frequencies.to_offset(self.freq.rule_code) + if isinstance(offset, Tick): + # _check_timedeltalike_freq_compat will raise if incompatible + delta = self._data._check_timedeltalike_freq_compat(other) + return delta + elif isinstance(other, DateOffset): + freqstr = other.rule_code + base = frequencies.get_base_alias(freqstr) + if base == self.freq.rule_code: + return other.n + msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) + raise IncompatibleFrequency(msg) + elif is_integer(other): + # integer is passed to .shift via + # _add_datetimelike_methods basically + # but ufunc may pass integer to _add_delta + return other + + # raise when input doesn't have freq + msg = "Input has different freq from {cls}(freq={freqstr})" + raise IncompatibleFrequency(msg.format(cls=type(self).__name__, + freqstr=self.freqstr)) + + # ------------------------------------------------------------------------ + # Rendering Methods + + def _format_native_types(self, na_rep=u'NaT', quoting=None, **kwargs): + # just dispatch, return ndarray + return self._data._format_native_types(na_rep=na_rep, + quoting=quoting, + **kwargs) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.astype(object).values + + # ------------------------------------------------------------------------ + # Indexing + + @cache_readonly + def _engine(self): + return self._engine_type(lambda: self, len(self)) @Appender(_index_shared_docs['__contains__']) def __contains__(self, key): @@ -319,11 +418,48 @@ def __contains__(self, key): @cache_readonly def _int64index(self): - return Int64Index(self.asi8, name=self.name, fastpath=True) + return Int64Index._simple_new(self.asi8, name=self.name) - @property - def values(self): - return self.astype(object).values + # ------------------------------------------------------------------------ + # Index Methods + + @deprecate_kwarg(old_arg_name='n', new_arg_name='periods') + def shift(self, periods): + """ + Shift index by desired number of increments. + + This method is for shifting the values of period indexes + by a specified time increment. + + Parameters + ---------- + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. + + .. versionchanged:: 0.24.0 + + Returns + ------- + pandas.PeriodIndex + Shifted index. 
+ + See Also + -------- + DatetimeIndex.shift : Shift values of DatetimeIndex. + """ + i8values = self._data._time_shift(periods) + return self._simple_new(i8values, name=self.name, freq=self.freq) + + def _coerce_scalar_to_index(self, item): + """ + we need to coerce a scalar to a compat for our index type + + Parameters + ---------- + item : scalar item to coerce + """ + return PeriodIndex([item], **self._get_attributes_dict()) def __array__(self, dtype=None): if is_integer_dtype(dtype): @@ -340,9 +476,9 @@ def __array_wrap__(self, result, context=None): """ if isinstance(context, tuple) and len(context) > 0: func = context[0] - if (func is np.add): + if func is np.add: pass - elif (func is np.subtract): + elif func is np.subtract: name = self.name left = context[1][0] right = context[1][1] @@ -363,21 +499,11 @@ def __array_wrap__(self, result, context=None): return result # the result is object dtype array of Period # cannot pass _simple_new as it is - return self._shallow_copy(result, freq=self.freq, name=self.name) - - @property - def size(self): - # Avoid materializing self._values - return self._ndarray_values.size - - @property - def shape(self): - # Avoid materializing self._values - return self._ndarray_values.shape + return type(self)(result, freq=self.freq, name=self.name) @property def _formatter_func(self): - return lambda x: "'%s'" % x + return self.array._formatter(boxed=False) def asof_locs(self, where, mask): """ @@ -404,14 +530,17 @@ def asof_locs(self, where, mask): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True, how='start'): dtype = pandas_dtype(dtype) - if is_integer_dtype(dtype): - return self._int64index.copy() if copy else self._int64index - elif is_datetime64_any_dtype(dtype): + + # We have a few special-cases for `dtype`. + # Failing those, we fall back to astyping the values + + if is_datetime64_any_dtype(dtype): + # 'how' is index-specific, isn't part of the EA interface.
tz = getattr(dtype, 'tz', None) return self.to_timestamp(how=how).tz_localize(tz) - elif is_period_dtype(dtype): - return self.asfreq(freq=dtype.freq) - return super(PeriodIndex, self).astype(dtype, copy=copy) + + result = self._data.astype(dtype, copy=copy) + return Index(result, name=self.name, dtype=dtype, copy=False) @Substitution(klass='PeriodIndex') @Appender(_shared_docs['searchsorted']) @@ -422,7 +551,10 @@ def searchsorted(self, value, side='left', sorter=None): raise IncompatibleFrequency(msg) value = value.ordinal elif isinstance(value, compat.string_types): - value = Period(value, freq=self.freq).ordinal + try: + value = Period(value, freq=self.freq).ordinal + except DateParseError: + raise KeyError("Cannot interpret '{}' as period".format(value)) return self._ndarray_values.searchsorted(value, side=side, sorter=sorter) @@ -444,73 +576,6 @@ def is_full(self): values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() - year = _wrap_field_accessor('year') - month = _wrap_field_accessor('month') - day = _wrap_field_accessor('day') - hour = _wrap_field_accessor('hour') - minute = _wrap_field_accessor('minute') - second = _wrap_field_accessor('second') - weekofyear = _wrap_field_accessor('week') - week = weekofyear - dayofweek = _wrap_field_accessor('dayofweek') - weekday = dayofweek - dayofyear = day_of_year = _wrap_field_accessor('dayofyear') - quarter = _wrap_field_accessor('quarter') - qyear = _wrap_field_accessor('qyear') - days_in_month = _wrap_field_accessor('days_in_month') - daysinmonth = days_in_month - - @property - def start_time(self): - return self.to_timestamp(how='start') - - @property - def end_time(self): - return self.to_timestamp(how='end') - - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.astype(object).values - - def to_timestamp(self, freq=None, how='start'): - """ - Cast to DatetimeIndex - - Parameters - ---------- - freq : string or DateOffset, optional - Target frequency. 
The default is 'D' for week or longer, - 'S' otherwise - how : {'s', 'e', 'start', 'end'} - - Returns - ------- - DatetimeIndex - """ - how = _validate_end_alias(how) - - end = how == 'E' - if end: - if freq == 'B': - # roll forward to ensure we land on B date - adjust = Timedelta(1, 'D') - Timedelta(1, 'ns') - return self.to_timestamp(how='start') + adjust - else: - adjust = Timedelta(1, 'ns') - return (self + 1).to_timestamp(how='start') - adjust - - if freq is None: - base, mult = _gfc(self.freq) - freq = frequencies.get_to_timestamp_base(base) - else: - freq = Period._maybe_convert_freq(freq) - - base, mult = _gfc(freq) - new_data = self.asfreq(freq, how) - - new_data = period.periodarr_to_dt64arr(new_data._ndarray_values, base) - return DatetimeIndex(new_data, freq='infer', name=self.name) - @property def inferred_type(self): # b/c data is represented as ints make sure we can't have ambiguous @@ -556,7 +621,8 @@ def get_value(self, series, key): except TypeError: pass - key = Period(key, self.freq).ordinal + period = Period(key, self.freq) + key = period.value if isna(period) else period.ordinal return com.maybe_box(self, self._engine.get_value(s, key), series, key) @@ -585,6 +651,18 @@ def _get_unique_index(self, dropna=False): res = res.dropna() return res + @Appender(Index.unique.__doc__) + def unique(self, level=None): + # override the Index.unique method for performance GH#23083 + if level is not None: + # this should never occur, but is retained to make the signature + # match Index.unique + self._validate_index_level(level) + + values = self._ndarray_values + result = unique1d(values) + return self._shallow_copy(result) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -604,6 +682,9 @@ def get_loc(self, key, method=None, tolerance=None): key = asdt except TypeError: pass + except DateParseError: + # A string with invalid format + raise KeyError("Cannot interpret '{}' as period".format(key)) try: key = Period(key, freq=self.freq) @@ -613,7 +694,7 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) try: - ordinal = tslib.iNaT if key is tslib.NaT else key.ordinal + ordinal = iNaT if key is NaT else key.ordinal if tolerance is not None: tolerance = self._convert_tolerance(tolerance, np.asarray(key)) @@ -741,37 +822,18 @@ def _assert_can_do_setop(self, other): msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None + def _wrap_setop_result(self, other, result): + name = get_op_result_name(self, other) result = self._apply_meta(result) result.name = name return result def _apply_meta(self, rawarr): if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._from_ordinals(rawarr, freq=self.freq, - name=self.name) + rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, + name=self.name) return rawarr - def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): - - values = self.astype(object).values - - if date_format: - formatter = lambda dt: dt.strftime(date_format) - else: - formatter = lambda dt: u'%s' % dt - - if self.hasnans: - mask = self._isnan - values[mask] = na_rep - imask = ~mask - values[imask] = np.array([formatter(dt) for dt - in values[imask]]) - else: - values = np.array([formatter(dt) for dt in values]) - return values - def __setstate__(self, state): """Necessary for making this object picklable""" @@ -787,12 +849,14 @@ def 
__setstate__(self, state): np.ndarray.__setstate__(data, nd_state) # backcompat - self._freq = Period._maybe_convert_freq(own_state[1]) + freq = Period._maybe_convert_freq(own_state[1]) else: # pragma: no cover data = np.empty(state) np.ndarray.__setstate__(self, state) + freq = None # ? + data = PeriodArray(data, freq=freq) self._data = data else: @@ -800,6 +864,106 @@ def __setstate__(self, state): _unpickle_compat = __setstate__ + @classmethod + def _add_datetimelike_methods(cls): + """ + add in the datetimelike methods (as we may have to override the + superclass) + """ + # TODO(DatetimeArray): move this up to DatetimeArrayMixin + + def __add__(self, other): + # dispatch to ExtensionArray implementation + result = self._data.__add__(other) + return wrap_arithmetic_op(self, other, result) + + cls.__add__ = __add__ + + def __radd__(self, other): + # alias for __add__ + return self.__add__(other) + cls.__radd__ = __radd__ + + def __sub__(self, other): + # dispatch to ExtensionArray implementation + result = self._data.__sub__(other) + return wrap_arithmetic_op(self, other, result) + + cls.__sub__ = __sub__ + + def __rsub__(self, other): + result = self._data.__rsub__(other) + return wrap_arithmetic_op(self, other, result) + + cls.__rsub__ = __rsub__ + + @classmethod + def _create_comparison_method(cls, op): + """ + Create a comparison method that dispatches to ``cls.values``. + """ + # TODO(DatetimeArray): move to base class. + def wrapper(self, other): + return op(self._data, other) + + wrapper.__doc__ = op.__doc__ + wrapper.__name__ = '__{}__'.format(op.__name__) + return wrapper + + def repeat(self, repeats, *args, **kwargs): + # TODO(DatetimeArray): Just use Index.repeat + return Index.repeat(self, repeats, *args, **kwargs) + + def view(self, dtype=None, type=None): + # TODO(DatetimeArray): remove + if dtype is None or dtype is __builtins__['type'](self): + return self + return self._ndarray_values.view(dtype=dtype) + + @property + def flags(self): + """ return the ndarray.flags for the underlying data """ + warnings.warn("{obj}.flags is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, stacklevel=2) + return self._ndarray_values.flags + + @property + def asi8(self): + # TODO(DatetimeArray): remove + return self.view('i8') + + def item(self): + """ + return the first element of the underlying data as a python + scalar + """ + # TODO(DatetimeArray): remove + if len(self) == 1: + return self[0] + else: + # copy numpy's message here because Py26 raises an IndexError + raise ValueError('can only convert an array of size 1 to a ' + 'Python scalar') + + @property + def data(self): + """ return the data pointer of the underlying data """ + warnings.warn("{obj}.data is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, stacklevel=2) + return np.asarray(self._data).data + + @property + def base(self): + """ return the base object if the memory of the underlying data is + shared + """ + warnings.warn("{obj}.base is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, stacklevel=2) + return np.asarray(self._data) + PeriodIndex._add_comparison_ops() PeriodIndex._add_numeric_methods_disabled() @@ -807,14 +971,6 @@ def __setstate__(self, state): PeriodIndex._add_datetimelike_methods() -def pnow(freq=None): - # deprecation, xref #13790 - warnings.warn("pd.pnow() and pandas.core.indexes.period.pnow() " - "are deprecated. 
Please use Period.now()", - FutureWarning, stacklevel=2) - return Period.now(freq=freq) - - def period_range(start=None, end=None, periods=None, freq='D', name=None): """ Return a fixed frequency PeriodIndex, with day (calendar) as the default diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fd8e17c369f5a..364aadb9523f0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,33 +1,30 @@ -from sys import getsizeof -import operator from datetime import timedelta +import operator +from sys import getsizeof +import warnings import numpy as np -from pandas._libs import index as libindex -from pandas.core.dtypes.common import ( - is_integer, - is_scalar, - is_timedelta64_dtype, - is_int64_dtype) -from pandas.core.dtypes.generic import ABCSeries, ABCTimedeltaIndex - -from pandas import compat -from pandas.compat import lrange, range, get_range_parameters +from pandas._libs import index as libindex, lib +import pandas.compat as compat +from pandas.compat import get_range_parameters, lrange, range from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, cache_readonly + +from pandas.core.dtypes import concat as _concat +from pandas.core.dtypes.common import ( + is_int64_dtype, is_integer, is_scalar, is_timedelta64_dtype) +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCSeries, ABCTimedeltaIndex) -import pandas.core.common as com from pandas.core import ops -from pandas.core.indexes.base import Index, _index_shared_docs -from pandas.util._decorators import Appender, cache_readonly -import pandas.core.dtypes.concat as _concat +import pandas.core.common as com import pandas.core.indexes.base as ibase - +from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index class RangeIndex(Int64Index): - """ Immutable Index implementing a monotonic integer range. @@ -40,7 +37,7 @@ class RangeIndex(Int64Index): Parameters ---------- - start : int (default: 0), or other RangeIndex instance. + start : int (default: 0), or other RangeIndex instance If int and "stop" is not given, interpreted as "stop" instead. stop : int (default: 0) step : int (default: 1) @@ -51,8 +48,8 @@ class RangeIndex(Int64Index): See Also -------- - Index : The base pandas Index type - Int64Index : Index of int64 data + Index : The base pandas Index type. + Int64Index : Index of int64 data. Attributes ---------- @@ -66,11 +63,18 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = libindex.Int64Engine + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, start=None, stop=None, step=None, - dtype=None, copy=False, name=None, fastpath=False): + dtype=None, copy=False, name=None, fastpath=None): - if fastpath: - return cls._simple_new(start, stop, step, name=name) + if fastpath is not None: + warnings.warn("The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, stacklevel=2) + if fastpath: + return cls._simple_new(start, stop, step, name=name) cls._validate_dtype(dtype) @@ -120,7 +124,7 @@ def ensure_int(value, field): @classmethod def from_range(cls, data, name=None, dtype=None, **kwargs): - """ create RangeIndex from a range (py3), or xrange (py2) object """ + """ Create RangeIndex from a range (py3), or xrange (py2) object. """ if not isinstance(data, range): raise TypeError( '{0}(...) 
must be called with object coercible to a ' @@ -156,6 +160,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None, result._reset_identity() return result + # -------------------------------------------------------------------- + @staticmethod def _validate_dtype(dtype): """ require dtype to be None or int64 """ @@ -173,7 +179,7 @@ def _data(self): @cache_readonly def _int64index(self): - return Int64Index(self._data, name=self.name, fastpath=True) + return Int64Index._simple_new(self._data, name=self.name) def _get_data_as_items(self): """ return a list of tuples of start, stop, step """ @@ -186,6 +192,9 @@ def __reduce__(self): d.update(dict(self._get_data_as_items())) return ibase._new_Index, (self.__class__, d), None + # -------------------------------------------------------------------- + # Rendering Methods + def _format_attrs(self): """ Return a list of tuples of the (attr, formatted_value) @@ -199,6 +208,8 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + # -------------------------------------------------------------------- + @cache_readonly def nbytes(self): """ @@ -261,8 +272,9 @@ def tolist(self): @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is None: - return RangeIndex(name=self.name, fastpath=True, - **dict(self._get_data_as_items())) + name = kwargs.get("name", self.name) + return RangeIndex._simple_new( + name=name, **dict(self._get_data_as_items())) else: kwargs.setdefault('name', self.name) return self._int64index._shallow_copy(values, **kwargs) @@ -272,8 +284,8 @@ def copy(self, name=None, deep=False, dtype=None, **kwargs): self._validate_dtype(dtype) if name is None: name = self.name - return RangeIndex(name=name, fastpath=True, - **dict(self._get_data_as_items())) + return RangeIndex._simple_new( + name=name, **dict(self._get_data_as_items())) def _minmax(self, meth): no_steps = len(self) - 1 @@ -302,7 +314,7 @@ def argsort(self, *args, **kwargs): ------- argsorted : numpy array - See also + See Also -------- numpy.ndarray.argsort """ @@ -342,6 +354,10 @@ def intersection(self, other): ------- intersection : Index """ + + if self.equals(other): + return self._get_reconciled_name_object(other) + if not isinstance(other, RangeIndex): return super(RangeIndex, self).intersection(other) @@ -373,7 +389,7 @@ def intersection(self, other): tmp_start = first._start + (second._start - first._start) * \ first._step // gcd * s new_step = first._step * second._step // gcd - new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) + new_index = RangeIndex._simple_new(tmp_start, int_high, new_step) # adjust index to limiting interval new_index._start = new_index._min_fitting_element(int_low) @@ -422,10 +438,9 @@ def union(self, other): union : Index """ self._assert_can_do_setop(other) - if len(other) == 0 or self.equals(other): - return self - if len(self) == 0: - return other + if len(other) == 0 or self.equals(other) or len(self) == 0: + return super(RangeIndex, self).union(other) + if isinstance(other, RangeIndex): start_s, step_s = self._start, self._step end_s = self._start + self._step * (len(self) - 1) @@ -496,7 +511,12 @@ def __getitem__(self, key): super_getitem = super(RangeIndex, self).__getitem__ if is_scalar(key): - n = int(key) + if not lib.is_integer(key): + raise IndexError("only integers, slices (`:`), " + "ellipsis (`...`), numpy.newaxis (`None`) " + "and integer or boolean " + "arrays are valid indices") + n = com.cast_scalar_indexer(key) if 
n != key: return super_getitem(key) if n < 0: @@ -551,12 +571,15 @@ def __getitem__(self, key): stop = self._start + self._step * stop step = self._step * step - return RangeIndex(start, stop, step, name=self.name, fastpath=True) + return RangeIndex._simple_new(start, stop, step, name=self.name) # fall back to Int64Index return super_getitem(key) def __floordiv__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented + if is_integer(other) and other != 0: if (len(self) == 0 or self._start % other == 0 and @@ -564,12 +587,12 @@ def __floordiv__(self, other): start = self._start // other step = self._step // other stop = start + len(self) * step - return RangeIndex(start, stop, step, name=self.name, - fastpath=True) + return RangeIndex._simple_new( + start, stop, step, name=self.name) if len(self) == 1: start = self._start // other - return RangeIndex(start, start + 1, 1, name=self.name, - fastpath=True) + return RangeIndex._simple_new( + start, start + 1, 1, name=self.name) return self._int64index // other @classmethod @@ -588,7 +611,7 @@ def _make_evaluate_binop(op, step=False): """ def _evaluate_numeric_binop(self, other): - if isinstance(other, ABCSeries): + if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented elif isinstance(other, ABCTimedeltaIndex): # Defer to TimedeltaIndex implementation @@ -644,7 +667,8 @@ def _evaluate_numeric_binop(self, other): return op(self._int64index, other) # TODO: Do attrs get handled reliably? - return _evaluate_numeric_binop + name = '__{name}__'.format(name=op.__name__) + return compat.set_function_name(_evaluate_numeric_binop, name, cls) cls.__add__ = _make_evaluate_binop(operator.add) cls.__radd__ = _make_evaluate_binop(ops.radd) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 933bc6233dca9..1c84e592d3a0d 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,73 +1,54 @@ """ implement the TimedeltaIndex """ -import operator from datetime import datetime +import warnings import numpy as np + +from pandas._libs import ( + NaT, Timedelta, index as libindex, join as libjoin, lib) +import pandas.compat as compat +from pandas.util._decorators import Appender, Substitution + from pandas.core.dtypes.common import ( - _TD_DTYPE, - is_integer, - is_float, - is_bool_dtype, - is_list_like, - is_scalar, - is_timedelta64_dtype, - is_timedelta64_ns_dtype, - pandas_dtype, - ensure_int64) + _TD_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar, + is_timedelta64_dtype, is_timedelta64_ns_dtype, pandas_dtype) +import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna -from pandas.core.arrays.timedeltas import ( - TimedeltaArrayMixin, _is_convertible_to_td, _to_m8) from pandas.core.arrays import datetimelike as dtl - -from pandas.core.indexes.base import Index -from pandas.core.indexes.numeric import Int64Index -import pandas.compat as compat - -from pandas.tseries.frequencies import to_offset +from pandas.core.arrays.timedeltas import ( + TimedeltaArrayMixin as TimedeltaArray, _is_convertible_to_td, _to_m8) from pandas.core.base import _shared_docs -from pandas.core.indexes.base import _index_shared_docs import pandas.core.common as com -import pandas.core.dtypes.concat as _concat -from pandas.util._decorators import Appender, Substitution +from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - TimelikeOps, DatetimeIndexOpsMixin, 
wrap_arithmetic_op) -from pandas.core.tools.timedeltas import ( - to_timedelta, _coerce_scalar_to_timedelta_type) -from pandas._libs import (lib, index as libindex, - join as libjoin, Timedelta, NaT) - + DatetimeIndexOpsMixin, wrap_arithmetic_op, wrap_array_method, + wrap_field_accessor) +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name +from pandas.core.tools.timedeltas import _coerce_scalar_to_timedelta_type -def _wrap_field_accessor(name): - fget = getattr(TimedeltaArrayMixin, name).fget +from pandas.tseries.frequencies import to_offset - def f(self): - result = fget(self) - return Index(result, name=self.name) - f.__name__ = name - f.__doc__ = fget.__doc__ - return property(f) +def _make_wrapped_arith_op(opname): + meth = getattr(TimedeltaArray, opname) -def _td_index_cmp(cls, op): - """ - Wrap comparison operations to convert timedelta-like to timedelta64 - """ - opname = '__{name}__'.format(name=op.__name__) + def method(self, other): + oth = other + if isinstance(other, Index): + oth = other._data - def wrapper(self, other): - result = getattr(TimedeltaArrayMixin, opname)(self, other) - if is_bool_dtype(result): - # support of bool dtype indexers - return result - return Index(result) + result = meth(self, oth) + return wrap_arithmetic_op(self, other, result) - return compat.set_function_name(wrapper, opname, cls) + method.__name__ = opname + return method -class TimedeltaIndex(TimedeltaArrayMixin, DatetimeIndexOpsMixin, - TimelikeOps, Int64Index): +class TimedeltaIndex(TimedeltaArray, DatetimeIndexOpsMixin, + dtl.TimelikeOps, Int64Index): """ Immutable ndarray of timedelta64 data, represented internally as int64, and which can be boxed to timedelta objects @@ -76,7 +57,7 @@ class TimedeltaIndex(TimedeltaArrayMixin, DatetimeIndexOpsMixin, ---------- data : array-like (1-dimensional), optional Optional timedelta-like data to construct index with - unit: unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional + unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional which is an integer/float number freq : string or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string @@ -107,10 +88,10 @@ class TimedeltaIndex(TimedeltaArrayMixin, DatetimeIndexOpsMixin, See Also --------- - Index : The base pandas Index type + Index : The base pandas Index type. Timedelta : Represents a duration between two dates or times. - DatetimeIndex : Index of datetime64 data - PeriodIndex : Index of Period data + DatetimeIndex : Index of datetime64 data. + PeriodIndex : Index of Period data. 
Attributes ---------- @@ -153,16 +134,6 @@ def _join_i8_wrapper(joinf, **kwargs): _datetimelike_methods = ["to_pytimedelta", "total_seconds", "round", "floor", "ceil"] - @classmethod - def _add_comparison_methods(cls): - """ add in comparison methods """ - cls.__eq__ = _td_index_cmp(cls, operator.eq) - cls.__ne__ = _td_index_cmp(cls, operator.ne) - cls.__lt__ = _td_index_cmp(cls, operator.lt) - cls.__gt__ = _td_index_cmp(cls, operator.gt) - cls.__le__ = _td_index_cmp(cls, operator.le) - cls.__ge__ = _td_index_cmp(cls, operator.ge) - _engine_type = libindex.TimedeltaEngine _comparables = ['name', 'freq'] @@ -172,75 +143,66 @@ def _add_comparison_methods(cls): _freq = None - def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, - periods=None, closed=None, dtype=None, copy=False, - name=None, verify_integrity=True): + # ------------------------------------------------------------------- + # Constructors - if isinstance(data, TimedeltaIndex) and freq is None and name is None: - if copy: - return data.copy() - else: - return data._shallow_copy() + def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, + periods=None, closed=None, dtype=_TD_DTYPE, copy=False, + name=None, verify_integrity=None): - freq, freq_infer = dtl.maybe_infer_freq(freq) + if verify_integrity is not None: + warnings.warn("The 'verify_integrity' argument is deprecated, " + "will be removed in a future version.", + FutureWarning, stacklevel=2) + else: + verify_integrity = True if data is None: - # TODO: Remove this block and associated kwargs; GH#20535 - if freq is None and com._any_none(periods, start, end): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') - periods = dtl.validate_periods(periods) - return cls._generate_range(start, end, periods, name, freq, - closed=closed) - - if unit is not None: - data = to_timedelta(data, unit=unit, box=False) + freq, freq_infer = dtl.maybe_infer_freq(freq) + warnings.warn("Creating a TimedeltaIndex by passing range " + "endpoints is deprecated. 
Use " + "`pandas.timedelta_range` instead.", + FutureWarning, stacklevel=2) + result = cls._generate_range(start, end, periods, freq, + closed=closed) + result.name = name + return result if is_scalar(data): - raise ValueError('TimedeltaIndex() must be called with a ' - 'collection of some kind, {data} was passed' - .format(data=repr(data))) - - # convert if not already - if getattr(data, 'dtype', None) != _TD_DTYPE: - data = to_timedelta(data, unit=unit, box=False) - elif copy: - data = np.array(data, copy=True) + raise TypeError('{cls}() must be called with a ' + 'collection of some kind, {data} was passed' + .format(cls=cls.__name__, data=repr(data))) - subarr = cls._simple_new(data, name=name, freq=freq) - # check that we are matching freqs - if verify_integrity and len(subarr) > 0: - if freq is not None and not freq_infer: - cls._validate_frequency(subarr, freq) + if isinstance(data, TimedeltaIndex) and freq is None and name is None: + if copy: + return data.copy() + else: + return data._shallow_copy() - if freq_infer: - inferred = subarr.inferred_freq - if inferred: - subarr.freq = to_offset(inferred) + # - Cases checked above all return/raise before reaching here - # - return subarr + result = cls._from_sequence(data, freq=freq, unit=unit, + dtype=dtype, copy=copy) + result.name = name + return result @classmethod - def _generate_range(cls, start, end, periods, - name=None, freq=None, closed=None): - # TimedeltaArray gets `name` via **kwargs, so we need to explicitly - # override it if name is passed as a positional argument - return super(TimedeltaIndex, cls)._generate_range(start, end, - periods, freq, - name=name, - closed=closed) + def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): + # `dtype` is passed by _shallow_copy in corner cases, should always + # be timedelta64[ns] if present + assert dtype == _TD_DTYPE - @classmethod - def _simple_new(cls, values, name=None, freq=None, **kwargs): - result = super(TimedeltaIndex, cls)._simple_new(values, freq, **kwargs) + assert isinstance(values, np.ndarray), type(values) + if values.dtype == 'i8': + values = values.view('m8[ns]') + assert values.dtype == 'm8[ns]', values.dtype + + result = super(TimedeltaIndex, cls)._simple_new(values, freq) result.name = name result._reset_identity() return result - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_timedelta64 - return _get_format_timedelta64(self, box=True) + # ------------------------------------------------------------------- def __setstate__(self, state): """Necessary for making this object picklable""" @@ -258,10 +220,13 @@ def _maybe_update_attributes(self, attrs): attrs['freq'] = 'infer' return attrs - def _evaluate_with_timedelta_like(self, other, op): - result = TimedeltaArrayMixin._evaluate_with_timedelta_like(self, other, - op) - return wrap_arithmetic_op(self, other, result) + # ------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): from pandas.io.formats.format import Timedelta64Formatter @@ -269,15 +234,51 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): nat_rep=na_rep, justify='all').get_result() - days = _wrap_field_accessor("days") - seconds = _wrap_field_accessor("seconds") - microseconds = 
_wrap_field_accessor("microseconds") - nanoseconds = _wrap_field_accessor("nanoseconds") + # ------------------------------------------------------------------- + # Wrapping TimedeltaArray + + __mul__ = _make_wrapped_arith_op("__mul__") + __rmul__ = _make_wrapped_arith_op("__rmul__") + __floordiv__ = _make_wrapped_arith_op("__floordiv__") + __rfloordiv__ = _make_wrapped_arith_op("__rfloordiv__") + __mod__ = _make_wrapped_arith_op("__mod__") + __rmod__ = _make_wrapped_arith_op("__rmod__") + __divmod__ = _make_wrapped_arith_op("__divmod__") + __rdivmod__ = _make_wrapped_arith_op("__rdivmod__") + + days = wrap_field_accessor(TimedeltaArray.days) + seconds = wrap_field_accessor(TimedeltaArray.seconds) + microseconds = wrap_field_accessor(TimedeltaArray.microseconds) + nanoseconds = wrap_field_accessor(TimedeltaArray.nanoseconds) + + total_seconds = wrap_array_method(TimedeltaArray.total_seconds, True) + + def __truediv__(self, other): + oth = other + if isinstance(other, Index): + # TimedeltaArray defers, so we need to unwrap + oth = other._values + result = TimedeltaArray.__truediv__(self, oth) + return wrap_arithmetic_op(self, other, result) + + def __rtruediv__(self, other): + oth = other + if isinstance(other, Index): + # TimedeltaArray defers, so we need to unwrap + oth = other._values + result = TimedeltaArray.__rtruediv__(self, oth) + return wrap_arithmetic_op(self, other, result) + + if compat.PY2: + __div__ = __truediv__ + __rdiv__ = __rtruediv__ - @Appender(TimedeltaArrayMixin.total_seconds.__doc__) - def total_seconds(self): - result = TimedeltaArrayMixin.total_seconds(self) - return Index(result, name=self.name) + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique + + # ------------------------------------------------------------------- @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): @@ -286,7 +287,8 @@ def astype(self, dtype, copy=True): # return an index (essentially this is division) result = self.values.astype(dtype, copy=copy) if self.hasnans: - values = self._maybe_mask_results(result, convert='float64') + values = self._maybe_mask_results(result, fill_value=None, + convert='float64') return Index(values, name=self.name) return Index(result.astype('i8'), name=self.name) return super(TimedeltaIndex, self).astype(dtype, copy=copy) @@ -306,6 +308,10 @@ def union(self, other): y : Index or TimedeltaIndex """ self._assert_can_do_setop(other) + + if len(other) == 0 or self.equals(other) or len(self) == 0: + return super(TimedeltaIndex, self).union(other) + if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) @@ -338,7 +344,7 @@ def join(self, other, how='left', level=None, return_indexers=False, sort=sort) def _wrap_joined_index(self, joined, other): - name = self.name if self.name == other.name else None + name = get_op_result_name(self, other) if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and self._can_fast_union(other)): joined = self._shallow_copy(joined, name=name) @@ -398,10 +404,6 @@ def _fast_union(self, other): else: return left - def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None - return self._simple_new(result, name=name, freq=None) - def intersection(self, other): """ Specialized intersection for TimedeltaIndex objects. 
May be much faster @@ -416,6 +418,10 @@ def intersection(self, other): y : Index or TimedeltaIndex """ self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) @@ -553,16 +559,13 @@ def _maybe_cast_slice_bound(self, label, side, kind): return label - def _get_string_slice(self, key, use_lhs=True, use_rhs=True): - freq = getattr(self, 'freqstr', - getattr(self, 'inferred_freq', None)) + def _get_string_slice(self, key): if is_integer(key) or is_float(key) or key is NaT: self._invalid_indexer('slice', key) - loc = self._partial_td_slice(key, freq, use_lhs=use_lhs, - use_rhs=use_rhs) + loc = self._partial_td_slice(key) return loc - def _partial_td_slice(self, key, freq, use_lhs=True, use_rhs=True): + def _partial_td_slice(self, key): # given a key, try to figure out a location for a partial slice if not isinstance(key, compat.string_types): @@ -570,43 +573,6 @@ def _partial_td_slice(self, key, freq, use_lhs=True, use_rhs=True): raise NotImplementedError - # TODO(wesm): dead code - # parsed = _coerce_scalar_to_timedelta_type(key, box=True) - - # is_monotonic = self.is_monotonic - - # # figure out the resolution of the passed td - # # and round to it - - # # t1 = parsed.round(reso) - - # t2 = t1 + to_offset(parsed.resolution) - Timedelta(1, 'ns') - - # stamps = self.asi8 - - # if is_monotonic: - - # # we are out of range - # if (len(stamps) and ((use_lhs and t1.value < stamps[0] and - # t2.value < stamps[0]) or - # ((use_rhs and t1.value > stamps[-1] and - # t2.value > stamps[-1])))): - # raise KeyError - - # # a monotonic (sorted) series can be sliced - # left = (stamps.searchsorted(t1.value, side='left') - # if use_lhs else None) - # right = (stamps.searchsorted(t2.value, side='right') - # if use_rhs else None) - - # return slice(left, right) - - # lhs_mask = (stamps >= t1.value) if use_lhs else True - # rhs_mask = (stamps <= t2.value) if use_rhs else True - - # # try to find a the dates - # return (lhs_mask & rhs_mask).nonzero()[0] - @Substitution(klass='TimedeltaIndex') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): @@ -668,7 +634,7 @@ def insert(self, loc, item): try: new_tds = np.concatenate((self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)) - return TimedeltaIndex(new_tds, name=self.name, freq=freq) + return self._shallow_copy(new_tds, freq=freq) except (AttributeError, TypeError): @@ -708,8 +674,8 @@ def delete(self, loc): return TimedeltaIndex(new_tds, name=self.name, freq=freq) -TimedeltaIndex._add_comparison_methods() -TimedeltaIndex._add_numeric_methods() +TimedeltaIndex._add_comparison_ops() +TimedeltaIndex._add_numeric_methods_unary() TimedeltaIndex._add_logical_methods_disabled() TimedeltaIndex._add_datetimelike_methods() @@ -797,5 +763,8 @@ def timedelta_range(start=None, end=None, periods=None, freq=None, if freq is None and com._any_none(periods, start, end): freq = 'D' - return TimedeltaIndex(start=start, end=end, periods=periods, - freq=freq, name=name, closed=closed) + freq, freq_infer = dtl.maybe_infer_freq(freq) + result = TimedeltaIndex._generate_range(start, end, periods, freq, + closed=closed) + result.name = name + return result diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 150518aadcfd9..0914324a03f84 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,27 +1,23 @@ # pylint: disable=W0223 import textwrap import warnings + import numpy as 
np -from pandas.compat import range, zip + +from pandas._libs.indexing import _NDFrameIndexerBase import pandas.compat as compat -from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries -from pandas.core.dtypes.common import ( - is_integer_dtype, - is_integer, is_float, - is_list_like, - is_sequence, - is_iterator, - is_scalar, - is_sparse, - ensure_platform_int) -from pandas.core.dtypes.missing import isna, _infer_fill_value +from pandas.compat import range, zip from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender -from pandas.core.index import Index, MultiIndex +from pandas.core.dtypes.common import ( + ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator, + is_list_like, is_scalar, is_sequence, is_sparse) +from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries +from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas._libs.indexing import _NDFrameIndexerBase +from pandas.core.index import Index, MultiIndex # the supported indexers @@ -304,8 +300,7 @@ def _setitem_with_indexer(self, indexer, value): self._has_valid_setitem_indexer(indexer) # also has the side effect of consolidating in-place - # TODO: Panel, DataFrame are not imported, remove? - from pandas import Panel, DataFrame, Series # noqa + from pandas import Series info_axis = self.obj._info_axis_number # maybe partial set @@ -553,14 +548,14 @@ def can_do_equal_len(): is_scalar(plane_indexer[0])): return False - l = len(value) item = labels[0] index = self.obj[item].index + values_len = len(value) # equal len list/ndarray - if len(index) == l: + if len(index) == values_len: return True - elif lplane_indexer == l: + elif lplane_indexer == values_len: return True return False @@ -717,8 +712,8 @@ def ravel(i): # single indexer if len(indexer) > 1 and not multiindex_indexer: - l = len(indexer[1]) - ser = np.tile(ser, l).reshape(l, -1).T + len_indexer = len(indexer[1]) + ser = np.tile(ser, len_indexer).reshape(len_indexer, -1).T return ser @@ -1403,7 +1398,6 @@ class _IXIndexer(_NDFrameIndexer): usually better to be explicit and use ``.iloc`` or ``.loc``. See more at :ref:`Advanced Indexing `. - """ def __init__(self, name, obj): @@ -1567,11 +1561,11 @@ class _LocIndexer(_LocationIndexer): See Also -------- - DataFrame.at : Access a single value for a row/column label pair - DataFrame.iloc : Access group of rows and columns by integer position(s) + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.iloc : Access group of rows and columns by integer position(s). DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the Series/DataFrame. - Series.loc : Access group of values using labels + Series.loc : Access group of values using labels. Examples -------- @@ -1834,8 +1828,8 @@ def _get_partial_string_timestamp_match_key(self, key, labels): """Translate any partial string timestamp matches in key, returning the new key (GH 10331)""" if isinstance(labels, MultiIndex): - if isinstance(key, compat.string_types) and \ - labels.levels[0].is_all_dates: + if (isinstance(key, compat.string_types) and + labels.levels[0].is_all_dates): # Convert key '2016-01-01' to # ('2016-01-01'[, slice(None, None, None)]+) key = tuple([key] + [slice(None)] * (len(labels.levels) - 1)) @@ -1845,8 +1839,8 @@ def _get_partial_string_timestamp_match_key(self, key, labels): # (..., slice('2016-01-01', '2016-01-01', None), ...) 
new_key = [] for i, component in enumerate(key): - if isinstance(component, compat.string_types) and \ - labels.levels[i].is_all_dates: + if (isinstance(component, compat.string_types) and + labels.levels[i].is_all_dates): new_key.append(slice(component, component, None)) else: new_key.append(component) @@ -2077,9 +2071,9 @@ def _validate_key(self, key, axis): elif is_list_like_indexer(key): # check that the key does not exceed the maximum size of the index arr = np.array(key) - l = len(self.obj._get_axis(axis)) + len_axis = len(self.obj._get_axis(axis)) - if len(arr) and (arr.max() >= l or arr.min() < -l): + if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis): raise IndexError("positional indexers are out-of-bounds") else: raise ValueError("Can only index by location with " @@ -2136,9 +2130,8 @@ def _validate_integer(self, key, axis): If 'key' is not a valid position in axis 'axis' """ - ax = self.obj._get_axis(axis) - l = len(ax) - if key >= l or key < -l: + len_axis = len(self.obj._get_axis(axis)) + if key >= len_axis or key < -len_axis: raise IndexError("single positional indexer is out-of-bounds") def _getitem_tuple(self, tup): @@ -2301,9 +2294,9 @@ class _AtIndexer(_ScalarAccessIndexer): See Also -------- DataFrame.iat : Access a single value for a row/column pair by integer - position - DataFrame.loc : Access a group of rows and columns by label(s) - Series.at : Access a single value using a label + position. + DataFrame.loc : Access a group of rows and columns by label(s). + Series.at : Access a single value using a label. Examples -------- @@ -2371,9 +2364,9 @@ class _iAtIndexer(_ScalarAccessIndexer): See Also -------- - DataFrame.at : Access a single value for a row/column label pair - DataFrame.loc : Access a group of rows and columns by label(s) - DataFrame.iloc : Access a group of rows and columns by integer position(s) + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer position(s). Examples -------- @@ -2422,21 +2415,22 @@ def _convert_key(self, key, is_setter=False): def length_of_indexer(indexer, target=None): - """return the length of a single non-tuple indexer which could be a slice + """ + return the length of a single non-tuple indexer which could be a slice """ if target is not None and isinstance(indexer, slice): - l = len(target) + target_len = len(target) start = indexer.start stop = indexer.stop step = indexer.step if start is None: start = 0 elif start < 0: - start += l - if stop is None or stop > l: - stop = l + start += target_len + if stop is None or stop > target_len: + stop = target_len elif stop < 0: - stop += l + stop += target_len if step is None: step = 1 elif step < 0: @@ -2450,7 +2444,8 @@ def length_of_indexer(indexer, target=None): def convert_to_index_sliceable(obj, key): - """if we are index sliceable, then return my slicer, otherwise return None + """ + if we are index sliceable, then return my slicer, otherwise return None """ idx = obj.index if isinstance(key, slice): @@ -2500,7 +2495,8 @@ def check_bool_indexer(ax, key): def check_setitem_lengths(indexer, value, values): - """Validate that value and indexer are the same length. + """ + Validate that value and indexer are the same length. An special-case is allowed for when the indexer is a boolean array and the number of true values equals the length of ``value``. 
In @@ -2543,7 +2539,8 @@ def check_setitem_lengths(indexer, value, values): def convert_missing_indexer(indexer): - """ reverse convert a missing indexer, which is a dict + """ + reverse convert a missing indexer, which is a dict return the scalar indexer and a boolean indicating if we converted """ @@ -2560,7 +2557,9 @@ def convert_missing_indexer(indexer): def convert_from_missing_indexer_tuple(indexer, axes): - """ create a filtered indexer that doesn't have any missing indexers """ + """ + create a filtered indexer that doesn't have any missing indexers + """ def get_indexer(_i, _idx): return (axes[_i].get_loc(_idx['key']) if isinstance(_idx, dict) else @@ -2614,7 +2613,8 @@ def maybe_convert_indices(indices, n): def validate_indices(indices, n): - """Perform bounds-checking for an indexer. + """ + Perform bounds-checking for an indexer. -1 is allowed for indicating missing values. diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 22caa577c2891..7d6aa6a42efc2 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -5,7 +5,7 @@ make_block, # io.pytables, io.packers FloatBlock, IntBlock, ComplexBlock, BoolBlock, ObjectBlock, TimeDeltaBlock, DatetimeBlock, DatetimeTZBlock, - CategoricalBlock, ExtensionBlock, SparseBlock, ScalarBlock, + CategoricalBlock, ExtensionBlock, ScalarBlock, Block) from .managers import ( # noqa:F401 BlockManager, SingleBlockManager, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0e57dd33b1c4e..9c2d4cd5729d2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,75 +1,45 @@ # -*- coding: utf-8 -*- -import warnings +from datetime import date, datetime, timedelta +import functools import inspect import re -from datetime import datetime, timedelta, date +import warnings import numpy as np -from pandas._libs import lib, tslib, tslibs, internals as libinternals -from pandas._libs.tslibs import conversion, Timedelta - -from pandas import compat +from pandas._libs import internals as libinternals, lib, tslib, tslibs +from pandas._libs.tslibs import Timedelta, conversion +import pandas.compat as compat from pandas.compat import range, zip - from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, DatetimeTZDtype, - PandasExtensionDtype, - CategoricalDtype) -from pandas.core.dtypes.common import ( - _TD_DTYPE, _NS_DTYPE, - ensure_platform_int, - is_integer, - is_dtype_equal, - is_timedelta64_dtype, - is_datetime64_dtype, is_datetimetz, is_sparse, - is_categorical, is_categorical_dtype, - is_integer_dtype, - is_datetime64tz_dtype, - is_bool_dtype, - is_object_dtype, - is_float_dtype, - is_numeric_v_string_like, is_extension_type, - is_extension_array_dtype, - is_list_like, - is_re, - is_re_compilable, - pandas_dtype) from pandas.core.dtypes.cast import ( - maybe_downcast_to_dtype, - maybe_upcast, - maybe_promote, - infer_dtype_from, - infer_dtype_from_scalar, - soft_convert_objects, - maybe_convert_objects, - astype_nansafe, - find_common_type, - maybe_infer_dtype_type) -from pandas.core.dtypes.missing import ( - isna, notna, array_equivalent, - _isna_compat, - is_null_datelike_scalar) + astype_nansafe, find_common_type, infer_dtype_from, + infer_dtype_from_scalar, maybe_convert_objects, maybe_downcast_to_dtype, + maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects) +from pandas.core.dtypes.common import ( + _NS_DTYPE, _TD_DTYPE, ensure_platform_int, 
is_bool_dtype, is_categorical, + is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_dtype_equal, is_extension_array_dtype, is_extension_type, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, + is_numeric_v_string_like, is_object_dtype, is_re, is_re_compilable, + is_sparse, is_timedelta64_dtype, pandas_dtype) import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.generic import ( - ABCSeries, - ABCDatetimeIndex, - ABCExtensionArray, - ABCIndexClass) + ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass, ABCSeries) +from pandas.core.dtypes.missing import ( + _isna_compat, array_equivalent, is_null_datelike_scalar, isna, notna) -import pandas.core.common as com import pandas.core.algorithms as algos -import pandas.core.missing as missing +from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.base import PandasObject - -from pandas.core.arrays import Categorical -from pandas.core.sparse.array import SparseArray - +import pandas.core.common as com from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_setitem_lengths +import pandas.core.missing as missing from pandas.io.formats.printing import pprint_thing @@ -300,7 +270,7 @@ def _slice(self, slicer): """ return a slice of my values """ return self.values[slicer] - def reshape_nd(self, labels, shape, ref_items, mgr=None): + def reshape_nd(self, labels, shape, ref_items): """ Parameters ---------- @@ -344,7 +314,11 @@ def dtype(self): @property def ftype(self): - return "{dtype}:{ftype}".format(dtype=self.dtype, ftype=self._ftype) + if getattr(self.values, '_pandas_ftype', False): + dtype = self.dtype.subtype + else: + dtype = self.dtype + return "{dtype}:{ftype}".format(dtype=dtype, ftype=self._ftype) def merge(self, other): return _merge_blocks([self, other]) @@ -378,7 +352,7 @@ def delete(self, loc): self.values = np.delete(self.values, loc, 0) self.mgr_locs = self.mgr_locs.delete(loc) - def apply(self, func, mgr=None, **kwargs): + def apply(self, func, **kwargs): """ apply the function to my values; return a block if we are not one """ @@ -390,8 +364,7 @@ def apply(self, func, mgr=None, **kwargs): return result - def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): + def fillna(self, value, limit=None, inplace=False, downcast=None): """ fillna on the block with the value. 
If we fail, then convert to ObjectBlock and try again """ @@ -416,7 +389,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, # fillna, but if we cannot coerce, then try again as an ObjectBlock try: - values, _, _, _ = self._try_coerce_args(self.values, value) + values, _ = self._try_coerce_args(self.values, value) blocks = self.putmask(mask, value, inplace=inplace) blocks = [b.make_block(values=self._try_coerce_result(b.values)) for b in blocks] @@ -517,7 +490,7 @@ def _maybe_downcast(self, blocks, downcast=None): blocks = [blocks] return _extend_blocks([b.downcast(downcast) for b in blocks]) - def downcast(self, dtypes=None, mgr=None): + def downcast(self, dtypes=None): """ try to downcast each item to the dict of dtypes if present """ # turn it off completely @@ -564,7 +537,7 @@ def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): **kwargs) def _astype(self, dtype, copy=False, errors='raise', values=None, - klass=None, mgr=None, **kwargs): + klass=None, **kwargs): """Coerce to the new type Parameters @@ -623,7 +596,6 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, # convert dtypes if needed dtype = pandas_dtype(dtype) - # astype processing if is_dtype_equal(self.dtype, dtype): if copy: @@ -631,8 +603,14 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, return self if klass is None: - if dtype == np.object_: + if is_sparse(self.values): + # special case sparse, Series[Sparse].astype(object) is sparse + klass = ExtensionBlock + elif is_object_dtype(dtype): klass = ObjectBlock + elif is_extension_array_dtype(dtype): + klass = ExtensionBlock + try: # force the copy here if values is None: @@ -675,11 +653,11 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, if newb.shape != self.shape: raise TypeError( "cannot set astype for copy = [{copy}] for dtype " - "({dtype} [{itemsize}]) with smaller itemsize than " - "current ({newb_dtype} [{newb_size}])".format( + "({dtype} [{shape}]) to different shape " + "({newb_dtype} [{newb_shape}])".format( copy=copy, dtype=self.dtype.name, - itemsize=self.itemsize, newb_dtype=newb.dtype.name, - newb_size=newb.itemsize)) + shape=self.shape, newb_dtype=newb.dtype.name, + newb_shape=newb.shape)) return newb def convert(self, copy=True, **kwargs): @@ -741,7 +719,7 @@ def _try_coerce_args(self, values, other): type(other).__name__, type(self).__name__.lower().replace('Block', ''))) - return values, False, other, False + return values, other def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -771,7 +749,7 @@ def to_native_types(self, slicer=None, na_rep='nan', quoting=None, return values # block actions #### - def copy(self, deep=True, mgr=None): + def copy(self, deep=True): """ copy constructor """ values = self.values if deep: @@ -779,7 +757,7 @@ def copy(self, deep=True, mgr=None): return self.make_block_same_class(values) def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None): + regex=False, convert=True): """replace the to_replace value with value, possible to create new blocks here this is just a call to putmask. regex is not used here. It is used in ObjectBlocks. It is here for API compatibility. 
@@ -791,8 +769,8 @@ def replace(self, to_replace, value, inplace=False, filter=None, # try to replace, if we raise an error, convert to ObjectBlock and # retry try: - values, _, to_replace, _ = self._try_coerce_args(self.values, - to_replace) + values, to_replace = self._try_coerce_args(self.values, + to_replace) mask = missing.mask_missing(values, to_replace) if filter is not None: filtered_out = ~self.mgr_locs.isin(filter) @@ -822,7 +800,7 @@ def _replace_single(self, *args, **kwargs): """ no-op on a non-ObjectBlock """ return self if kwargs['inplace'] else self.copy() - def setitem(self, indexer, value, mgr=None): + def setitem(self, indexer, value): """Set the value inplace, returning a a maybe different typed block. Parameters @@ -831,7 +809,6 @@ def setitem(self, indexer, value, mgr=None): The subset of self.values to set value : object The value being set - mgr : BlockPlacement, optional Returns ------- @@ -850,7 +827,7 @@ def setitem(self, indexer, value, mgr=None): # coerce if block dtype can store value values = self.values try: - values, _, value, _ = self._try_coerce_args(values, value) + values, value = self._try_coerce_args(values, value) # can keep its own dtype if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, value.dtype): @@ -881,7 +858,7 @@ def setitem(self, indexer, value, mgr=None): dtype = find_common_type([values.dtype, dtype]) if not is_dtype_equal(self.dtype, dtype): b = self.astype(dtype) - return b.setitem(indexer, value, mgr=mgr) + return b.setitem(indexer, value) # value must be storeable at this moment arr_value = np.array(value) @@ -951,7 +928,7 @@ def _is_empty_indexer(indexer): return block def putmask(self, mask, new, align=True, inplace=False, axis=0, - transpose=False, mgr=None): + transpose=False): """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -982,7 +959,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new = self.fill_value if self._can_hold_element(new): - _, _, new, _ = self._try_coerce_args(new_values, new) + _, new = self._try_coerce_args(new_values, new) if transpose: new_values = new_values.T @@ -1126,7 +1103,7 @@ def coerce_to_target_dtype(self, other): def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, limit_direction='forward', limit_area=None, fill_value=None, coerce=False, - downcast=None, mgr=None, **kwargs): + downcast=None, **kwargs): inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1153,7 +1130,7 @@ def check_int_bool(self, inplace): inplace=inplace, limit=limit, fill_value=fill_value, coerce=coerce, - downcast=downcast, mgr=mgr) + downcast=downcast) # try an interp method try: m = missing.clean_interp_method(method, **kwargs) @@ -1169,13 +1146,13 @@ def check_int_bool(self, inplace): limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, inplace=inplace, - downcast=downcast, mgr=mgr, **kwargs) + downcast=downcast, **kwargs) raise ValueError("invalid method '{0}' to interpolate.".format(method)) def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, limit=None, fill_value=None, coerce=False, - downcast=None, mgr=None): + downcast=None): """ fillna but using the interpolate machinery """ inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1190,7 +1167,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, return [self.copy()] values = self.values if inplace else self.values.copy() - values, _, fill_value, _ = self._try_coerce_args(values, fill_value) + 
values, fill_value = self._try_coerce_args(values, fill_value) values = missing.interpolate_2d(values, method=method, axis=axis, limit=limit, fill_value=fill_value, dtype=self.dtype) @@ -1202,7 +1179,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, def _interpolate(self, method=None, index=None, values=None, fill_value=None, axis=0, limit=None, limit_direction='forward', limit_area=None, - inplace=False, downcast=None, mgr=None, **kwargs): + inplace=False, downcast=None, **kwargs): """ interpolate using scipy wrappers """ inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1278,12 +1255,12 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): else: return self.make_block_same_class(new_values, new_mgr_locs) - def diff(self, n, axis=1, mgr=None): + def diff(self, n, axis=1): """ return block for the diff of the values """ new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] - def shift(self, periods, axis=0, mgr=None): + def shift(self, periods, axis=0): """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. need to do a lot more than @@ -1313,147 +1290,8 @@ def shift(self, periods, axis=0, mgr=None): return [self.make_block(new_values)] - def eval(self, func, other, errors='raise', try_cast=False, mgr=None): - """ - evaluate the block; return result block from the result - - Parameters - ---------- - func : how to combine self, other - other : a ndarray/object - errors : str, {'raise', 'ignore'}, default 'raise' - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original object - - try_cast : try casting the results to the input type - - Returns - ------- - a new block, the result of the func - """ - orig_other = other - values = self.values - - other = getattr(other, 'values', other) - - # make sure that we can broadcast - is_transposed = False - if hasattr(other, 'ndim') and hasattr(values, 'ndim'): - if values.ndim != other.ndim: - is_transposed = True - else: - if values.shape == other.shape[::-1]: - is_transposed = True - elif values.shape[0] == other.shape[-1]: - is_transposed = True - else: - # this is a broadcast error heree - raise ValueError( - "cannot broadcast shape [{t_shape}] with " - "block values [{oth_shape}]".format( - t_shape=values.T.shape, oth_shape=other.shape)) - - transf = (lambda x: x.T) if is_transposed else (lambda x: x) - - # coerce/transpose the args if needed - try: - values, values_mask, other, other_mask = self._try_coerce_args( - transf(values), other) - except TypeError: - block = self.coerce_to_target_dtype(orig_other) - return block.eval(func, orig_other, - errors=errors, - try_cast=try_cast, mgr=mgr) - - # get the result, may need to transpose the other - def get_result(other): - - # avoid numpy warning of comparisons again None - if other is None: - result = not func.__name__ == 'eq' - - # avoid numpy warning of elementwise comparisons to object - elif is_numeric_v_string_like(values, other): - result = False - - # avoid numpy warning of elementwise comparisons - elif func.__name__ == 'eq': - if is_list_like(other) and not isinstance(other, np.ndarray): - other = np.asarray(other) - - # if we can broadcast, then ok - if values.shape[-1] != other.shape[-1]: - return False - result = func(values, other) - else: - result = func(values, other) - - # mask if needed - if isinstance(values_mask, np.ndarray) and values_mask.any(): - result = result.astype('float64', copy=False) - 
result[values_mask] = np.nan - if other_mask is True: - result = result.astype('float64', copy=False) - result[:] = np.nan - elif isinstance(other_mask, np.ndarray) and other_mask.any(): - result = result.astype('float64', copy=False) - result[other_mask.ravel()] = np.nan - - return result - - # error handler if we have an issue operating with the function - def handle_error(): - - if errors == 'raise': - # The 'detail' variable is defined in outer scope. - raise TypeError( - 'Could not operate {other!r} with block values ' - '{detail!s}'.format(other=other, detail=detail)) # noqa - else: - # return the values - result = np.empty(values.shape, dtype='O') - result.fill(np.nan) - return result - - # get the result - try: - with np.errstate(all='ignore'): - result = get_result(other) - - # if we have an invalid shape/broadcast error - # GH4576, so raise instead of allowing to pass through - except ValueError as detail: - raise - except Exception as detail: - result = handle_error() - - # technically a broadcast error in numpy can 'work' by returning a - # boolean False - if not isinstance(result, np.ndarray): - if not isinstance(result, np.ndarray): - - # differentiate between an invalid ndarray-ndarray comparison - # and an invalid type comparison - if isinstance(values, np.ndarray) and is_list_like(other): - raise ValueError( - 'Invalid broadcasting comparison [{other!r}] with ' - 'block values'.format(other=other)) - - raise TypeError('Could not compare [{other!r}] ' - 'with block values'.format(other=other)) - - # transpose if needed - result = transf(result) - - # try to cast if requested - if try_cast: - result = self._try_cast_result(result) - - result = _block_shape(result, ndim=self.ndim) - return [self.make_block(result)] - def where(self, other, cond, align=True, errors='raise', - try_cast=False, axis=0, transpose=False, mgr=None): + try_cast=False, axis=0, transpose=False): """ evaluate the block; return result block(s) from the result @@ -1502,8 +1340,7 @@ def func(cond, values, other): if cond.ravel().all(): return values - values, values_mask, other, other_mask = self._try_coerce_args( - values, other) + values, other = self._try_coerce_args(values, other) try: return self._try_coerce_result(expressions.where( @@ -1566,7 +1403,7 @@ def equals(self, other): return False return array_equivalent(self.values, other.values) - def _unstack(self, unstacker_func, new_columns): + def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): """Return a list of unstacked blocks of self Parameters @@ -1575,6 +1412,10 @@ def _unstack(self, unstacker_func, new_columns): Partially applied unstacker. new_columns : Index All columns of the unstacked BlockManager. 
+ n_rows : int + Only used in ExtensionBlock.unstack + fill_value : int + Only used in ExtensionBlock.unstack Returns ------- @@ -1595,7 +1436,7 @@ def _unstack(self, unstacker_func, new_columns): blocks = [make_block(new_values, placement=new_placement)] return blocks, mask - def quantile(self, qs, interpolation='linear', axis=0, mgr=None): + def quantile(self, qs, interpolation='linear', axis=0, axes=None): """ compute the quantiles of the @@ -1604,6 +1445,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): qs: a scalar or list of the quantiles to be computed interpolation: type of interpolation, default 'linear' axis: axis to compute, default 0 + axes : BlockManager.axes Returns ------- @@ -1612,9 +1454,10 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): """ kw = {'interpolation': interpolation} values = self.get_values() - values, _, _, _ = self._try_coerce_args(values, values) + values, _ = self._try_coerce_args(values, values) def _nanpercentile1D(values, mask, q, **kw): + # mask is Union[ExtensionArray, ndarray] values = values[~mask] if len(values) == 0: @@ -1680,7 +1523,7 @@ def _nanpercentile(values, q, axis, **kw): if self.ndim == 1: ax = Float64Index([qs]) else: - ax = mgr.axes[0] + ax = axes[0] if is_empty: if self.ndim == 1: @@ -1699,7 +1542,7 @@ def _nanpercentile(values, q, axis, **kw): ndim=ndim) def _replace_coerce(self, to_replace, value, inplace=True, regex=False, - convert=False, mgr=None, mask=None): + convert=False, mask=None): """ Replace value corresponding to the given boolean array with another value. @@ -1716,7 +1559,6 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False, If true, perform regular expression substitution. convert : bool, default True If true, try to coerce any object types to better types. - mgr : BlockManager, optional mask : array-like of bool, optional True indicate corresponding element is ignored. @@ -1733,8 +1575,7 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False, return self._replace_single(to_replace, value, inplace=inplace, regex=regex, convert=convert, - mask=mask, - mgr=mgr) + mask=mask) return self @@ -1822,7 +1663,7 @@ def set(self, locs, values, check=False): self.values = values def putmask(self, mask, new, align=True, inplace=False, axis=0, - transpose=False, mgr=None): + transpose=False): """ putmask the data to the block; we must be a single block and not generate other blocks @@ -1845,7 +1686,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, # use block's copy logic. # .values may be an Index which does shallow copy by default new_values = self.values if inplace else self.copy().values - new_values, _, new, _ = self._try_coerce_args(new_values, new) + new_values, new = self._try_coerce_args(new_values, new) if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] @@ -1863,7 +1704,7 @@ def _slice(self, slicer): def _try_cast_result(self, result, dtype=None): return result - def _unstack(self, unstacker_func, new_columns): + def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): """Return a list of unstacked blocks of self Parameters @@ -1872,6 +1713,10 @@ def _unstack(self, unstacker_func, new_columns): Partially applied unstacker. new_columns : Index All columns of the unstacked BlockManager. 
+ n_rows : int + Only used in ExtensionBlock.unstack + fill_value : int + Only used in ExtensionBlock.unstack Returns ------- @@ -1883,11 +1728,11 @@ def _unstack(self, unstacker_func, new_columns): # NonConsolidatable blocks can have a single item only, so we return # one block per item unstacker = unstacker_func(self.values.T) - new_items = unstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = unstacker.get_new_values() - mask = mask.any(0) + new_placement, new_values, mask = self._get_unstack_items( + unstacker, new_columns + ) + new_values = new_values.T[mask] new_placement = new_placement[mask] @@ -1895,6 +1740,38 @@ def _unstack(self, unstacker_func, new_columns): for vals, place in zip(new_values, new_placement)] return blocks, mask + def _get_unstack_items(self, unstacker, new_columns): + """ + Get the placement, values, and mask for a Block unstack. + + This is shared between ObjectBlock and ExtensionBlock. They + differ in that ObjectBlock passes the values, while ExtensionBlock + passes the dummy ndarray of positions to be used by a take + later. + + Parameters + ---------- + unstacker : pandas.core.reshape.reshape._Unstacker + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + new_placement : ndarray[int] + The placement of the new columns in `new_columns`. + new_values : Union[ndarray, ExtensionArray] + The first return value from _Unstacker.get_new_values. + mask : ndarray[bool] + The second return value from _Unstacker.get_new_values. + """ + # shared with ExtensionBlock + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + return new_placement, new_values, mask + class ExtensionBlock(NonConsolidatableMixIn, Block): """Block for holding extension types. @@ -1954,7 +1831,7 @@ def is_view(self): def is_numeric(self): return self.values.dtype._is_numeric - def setitem(self, indexer, value, mgr=None): + def setitem(self, indexer, value): """Set the value inplace, returning a same-typed block. This differs from Block.setitem by not allowing setitem to change @@ -1966,7 +1843,6 @@ def setitem(self, indexer, value, mgr=None): The subset of self.values to set value : object The value being set - mgr : BlockPlacement, optional Returns ------- @@ -2039,7 +1915,19 @@ def _slice(self, slicer): return self.values[slicer] def formatting_values(self): - return self.values._formatting_values() + # Deprecating the ability to override _formatting_values. + # Do the warning here, it's only user in pandas, since we + # have to check if the subclass overrode it. + fv = getattr(type(self.values), '_formatting_values', None) + if fv and fv != ExtensionArray._formatting_values: + msg = ( + "'ExtensionArray._formatting_values' is deprecated. " + "Specify 'ExtensionArray._formatter' instead." 
+ ) + warnings.warn(msg, DeprecationWarning, stacklevel=10) + return self.values._formatting_values() + + return self.values def concat_same_type(self, to_concat, placement=None): """ @@ -2051,8 +1939,7 @@ def concat_same_type(self, to_concat, placement=None): return self.make_block_same_class(values, ndim=self.ndim, placement=placement) - def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): + def fillna(self, value, limit=None, inplace=False, downcast=None): values = self.values if inplace else self.values.copy() values = values.fillna(value=value, limit=limit) return [self.make_block_same_class(values=values, @@ -2068,7 +1955,7 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=None, limit=limit), placement=self.mgr_locs) - def shift(self, periods, axis=0, mgr=None): + def shift(self, periods, axis=0): """ Shift the block by `periods`. @@ -2080,6 +1967,34 @@ def shift(self, periods, axis=0, mgr=None): placement=self.mgr_locs, ndim=self.ndim)] + @property + def _ftype(self): + return getattr(self.values, '_pandas_ftype', Block._ftype) + + def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): + # ExtensionArray-safe unstack. + # We override ObjectBlock._unstack, which unstacks directly on the + # values of the array. For EA-backed blocks, this would require + # converting to a 2-D ndarray of objects. + # Instead, we unstack an ndarray of integer positions, followed by + # a `take` on the actual values. + dummy_arr = np.arange(n_rows) + dummy_unstacker = functools.partial(unstacker_func, fill_value=-1) + unstacker = dummy_unstacker(dummy_arr) + + new_placement, new_values, mask = self._get_unstack_items( + unstacker, new_columns + ) + + blocks = [ + self.make_block_same_class( + self.values.take(indices, allow_fill=True, + fill_value=fill_value), + [place]) + for indices, place in zip(new_values.T, new_placement) + ] + return blocks, mask + class NumericBlock(Block): __slots__ = () @@ -2234,9 +2149,9 @@ def _box_func(self): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return issubclass(tipo.type, np.timedelta64) + return issubclass(tipo.type, (np.timedelta64, np.int64)) return is_integer(element) or isinstance( - element, (timedelta, np.timedelta64)) + element, (timedelta, np.timedelta64, np.int64)) def fillna(self, value, **kwargs): @@ -2258,35 +2173,28 @@ def _try_coerce_args(self, values, other): Returns ------- - base-type values, values mask, base-type other, other mask + base-type values, base-type other """ - - values_mask = isna(values) values = values.view('i8') - other_mask = False if isinstance(other, bool): raise TypeError elif is_null_datelike_scalar(other): other = tslibs.iNaT - other_mask = True elif isinstance(other, Timedelta): - other_mask = isna(other) other = other.value elif isinstance(other, timedelta): other = Timedelta(other).value elif isinstance(other, np.timedelta64): - other_mask = isna(other) other = Timedelta(other).value elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): - other_mask = isna(other) other = other.astype('i8', copy=False).view('i8') else: # coercion issues # let higher levels handle raise TypeError - return values, values_mask, other, other_mask + return values, other def _try_coerce_result(self, result): """ reverse of try_coerce_args / try_operate """ @@ -2300,7 +2208,8 @@ def _try_coerce_result(self, result): return result def should_store(self, value): - return issubclass(value.dtype.type, np.timedelta64) + return 
(issubclass(value.dtype.type, np.timedelta64) and + not is_extension_array_dtype(value)) def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): @@ -2339,18 +2248,18 @@ def _can_hold_element(self, element): return isinstance(element, (bool, np.bool_)) def should_store(self, value): - return issubclass(value.dtype.type, np.bool_) + return (issubclass(value.dtype.type, np.bool_) and not + is_extension_array_dtype(value)) def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None): + regex=False, convert=True): inplace = validate_bool_kwarg(inplace, 'inplace') to_replace_values = np.atleast_1d(to_replace) if not np.can_cast(to_replace_values, bool): return self return super(BoolBlock, self).replace(to_replace, value, inplace=inplace, filter=filter, - regex=regex, convert=convert, - mgr=mgr) + regex=regex, convert=convert) class ObjectBlock(Block): @@ -2398,10 +2307,7 @@ def convert(self, *args, **kwargs): 'convert_timedeltas'] fn_inputs += ['copy'] - fn_kwargs = {} - for key in fn_inputs: - if key in kwargs: - fn_kwargs[key] = kwargs[key] + fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs} # operate column-by-column def f(m, v, i): @@ -2471,7 +2377,7 @@ def _try_coerce_args(self, values, other): # to store DatetimeTZBlock as object other = other.astype(object).values - return values, False, other, False + return values, other def should_store(self, value): return not (issubclass(value.dtype.type, @@ -2483,7 +2389,7 @@ def should_store(self, value): is_extension_array_dtype(value)) def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None): + regex=False, convert=True): to_rep_is_list = is_list_like(to_replace) value_is_list = is_list_like(value) both_lists = to_rep_is_list and value_is_list @@ -2495,19 +2401,19 @@ def replace(self, to_replace, value, inplace=False, filter=None, if not either_list and is_re(to_replace): return self._replace_single(to_replace, value, inplace=inplace, filter=filter, regex=True, - convert=convert, mgr=mgr) + convert=convert) elif not (either_list or regex): return super(ObjectBlock, self).replace(to_replace, value, inplace=inplace, filter=filter, regex=regex, - convert=convert, mgr=mgr) + convert=convert) elif both_lists: for to_rep, v in zip(to_replace, value): result_blocks = [] for b in blocks: result = b._replace_single(to_rep, v, inplace=inplace, filter=filter, regex=regex, - convert=convert, mgr=mgr) + convert=convert) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks @@ -2518,17 +2424,17 @@ def replace(self, to_replace, value, inplace=False, filter=None, for b in blocks: result = b._replace_single(to_rep, value, inplace=inplace, filter=filter, regex=regex, - convert=convert, mgr=mgr) + convert=convert) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks return self._replace_single(to_replace, value, inplace=inplace, filter=filter, convert=convert, - regex=regex, mgr=mgr) + regex=regex) def _replace_single(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None, mask=None): + regex=False, convert=True, mask=None): """ Replace elements by the given value. @@ -2545,7 +2451,6 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, If true, perform regular expression substitution. convert : bool, default True If true, try to coerce any object types to better types. 
- mgr : BlockManager, optional mask : array-like of bool, optional True indicate corresponding element is ignored. @@ -2588,8 +2493,7 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, # the superclass method -> to_replace is some kind of object return super(ObjectBlock, self).replace(to_replace, value, inplace=inplace, - filter=filter, regex=regex, - mgr=mgr) + filter=filter, regex=regex) new_values = self.values if inplace else self.values.copy() @@ -2630,7 +2534,7 @@ def re_replacer(s): return block def _replace_coerce(self, to_replace, value, inplace=True, regex=False, - convert=False, mgr=None, mask=None): + convert=False, mask=None): """ Replace value corresponding to the given boolean array with another value. @@ -2647,7 +2551,6 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False, If true, perform regular expression substitution. convert : bool, default True If true, try to coerce any object types to better types. - mgr : BlockManager, optional mask : array-like of bool, optional True indicate corresponding element is ignored. @@ -2658,7 +2561,7 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False, if mask.any(): block = super(ObjectBlock, self)._replace_coerce( to_replace=to_replace, value=value, inplace=inplace, - regex=regex, convert=convert, mgr=mgr, mask=mask) + regex=regex, convert=convert, mask=mask) if convert: block = [b.convert(by_item=True, numeric=False, copy=True) for b in block] @@ -2773,16 +2676,15 @@ def _maybe_coerce_values(self, values): values = conversion.ensure_datetime64ns(values) return values - def _astype(self, dtype, mgr=None, **kwargs): + def _astype(self, dtype, **kwargs): """ these automatically copy, so copy=True has no effect raise on an except if raise == True """ + dtype = pandas_dtype(dtype) # if we are passed a datetime64[ns, tz] if is_datetime64tz_dtype(dtype): - dtype = DatetimeTZDtype(dtype) - values = self.values if getattr(values, 'tz', None) is None: values = DatetimeIndex(values).tz_localize('UTC') @@ -2795,9 +2697,7 @@ def _astype(self, dtype, mgr=None, **kwargs): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - # TODO: this still uses asarray, instead of dtype.type - element = np.array(element) - return element.dtype == _NS_DTYPE or element.dtype == np.int64 + return tipo == _NS_DTYPE or tipo == np.int64 return (is_integer(element) or isinstance(element, datetime) or isna(element)) @@ -2815,33 +2715,29 @@ def _try_coerce_args(self, values, other): Returns ------- - base-type values, values mask, base-type other, other mask + base-type values, base-type other """ - values_mask = isna(values) values = values.view('i8') if isinstance(other, bool): raise TypeError elif is_null_datelike_scalar(other): other = tslibs.iNaT - other_mask = True elif isinstance(other, (datetime, np.datetime64, date)): other = self._box_func(other) if getattr(other, 'tz') is not None: raise TypeError("cannot coerce a Timestamp with a tz on a " "naive Block") - other_mask = isna(other) other = other.asm8.view('i8') elif hasattr(other, 'dtype') and is_datetime64_dtype(other): - other_mask = isna(other) other = other.astype('i8', copy=False).view('i8') else: # coercion issues # let higher levels handle raise TypeError - return values, values_mask, other, other_mask + return values, other def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -2877,7 +2773,8 @@ def to_native_types(self, slicer=None, na_rep=None, date_format=None, def 
should_store(self, value): return (issubclass(value.dtype.type, np.datetime64) and - not is_datetimetz(value)) + not is_datetime64tz_dtype(value) and + not is_extension_array_dtype(value)) def set(self, locs, values, check=False): """ @@ -2887,9 +2784,7 @@ def set(self, locs, values, check=False): ------- None """ - if values.dtype != _NS_DTYPE: - # Workaround for numpy 1.6 bug - values = conversion.ensure_datetime64ns(values) + values = conversion.ensure_datetime64ns(values, copy=False) self.values[locs] = values @@ -2947,7 +2842,7 @@ def is_view(self): # check the ndarray values of the DatetimeIndex values return self.values.values.base is not None - def copy(self, deep=True, mgr=None): + def copy(self, deep=True): """ copy constructor """ values = self.values if deep: @@ -2987,9 +2882,8 @@ def _try_coerce_args(self, values, other): Returns ------- - base-type values, values mask, base-type other, other mask + base-type values, base-type other """ - values_mask = _block_shape(isna(values), ndim=self.ndim) # asi8 is a view, needs copy values = _block_shape(values.asi8, ndim=self.ndim) @@ -3001,11 +2895,9 @@ def _try_coerce_args(self, values, other): elif (is_null_datelike_scalar(other) or (lib.is_scalar(other) and isna(other))): other = tslibs.iNaT - other_mask = True elif isinstance(other, self._holder): if other.tz != self.values.tz: raise ValueError("incompatible or non tz-aware value") - other_mask = _block_shape(isna(other), ndim=self.ndim) other = _block_shape(other.asi8, ndim=self.ndim) elif isinstance(other, (np.datetime64, datetime, date)): other = tslibs.Timestamp(other) @@ -3014,12 +2906,11 @@ def _try_coerce_args(self, values, other): # test we can have an equal time zone if tz is None or str(tz) != str(self.values.tz): raise ValueError("incompatible or non tz-aware value") - other_mask = isna(other) other = other.value else: raise TypeError - return values, values_mask, other, other_mask + return values, other def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -3032,7 +2923,9 @@ def _try_coerce_result(self, result): # allow passing of > 1dim if its trivial if result.ndim > 1: result = result.reshape(np.prod(result.shape)) - result = self.values._shallow_copy(result) + + # GH#24096 new values invalidates a frequency + result = self.values._shallow_copy(result, freq=None) return result @@ -3040,7 +2933,7 @@ def _try_coerce_result(self, result): def _box_func(self): return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) - def shift(self, periods, axis=0, mgr=None): + def shift(self, periods, axis=0): """ shift the block by periods """ # think about moving this to the DatetimeIndex. This is a non-freq @@ -3064,14 +2957,13 @@ def shift(self, periods, axis=0, mgr=None): return [self.make_block_same_class(new_values, placement=self.mgr_locs)] - def diff(self, n, axis=0, mgr=None): + def diff(self, n, axis=0): """1st discrete difference Parameters ---------- n : int, number of periods to diff axis : int, axis to diff upon. 
default 0 - mgr : default None Return ------ @@ -3104,164 +2996,6 @@ def concat_same_type(self, to_concat, placement=None): values, placement=placement or slice(0, len(values), 1)) -class SparseBlock(NonConsolidatableMixIn, Block): - """ implement as a list of sparse arrays of the same dtype """ - __slots__ = () - is_sparse = True - is_numeric = True - _box_to_block_values = False - _can_hold_na = True - _ftype = 'sparse' - _concatenator = staticmethod(_concat._concat_sparse) - - def __init__(self, values, placement, ndim=None): - # Ensure that we have the underlying SparseArray here... - if isinstance(values, ABCSeries): - values = values.values - assert isinstance(values, SparseArray) - super(SparseBlock, self).__init__(values, placement, ndim=ndim) - - @property - def _holder(self): - return SparseArray - - @property - def shape(self): - return (len(self.mgr_locs), self.sp_index.length) - - @property - def fill_value(self): - # return np.nan - return self.values.fill_value - - @fill_value.setter - def fill_value(self, v): - self.values.fill_value = v - - def to_dense(self): - return self.values.to_dense().view() - - @property - def sp_values(self): - return self.values.sp_values - - @sp_values.setter - def sp_values(self, v): - # reset the sparse values - self.values = SparseArray(v, sparse_index=self.sp_index, - kind=self.kind, dtype=v.dtype, - fill_value=self.values.fill_value, - copy=False) - - @property - def sp_index(self): - return self.values.sp_index - - @property - def kind(self): - return self.values.kind - - def _astype(self, dtype, copy=False, errors='raise', values=None, - klass=None, mgr=None, **kwargs): - if values is None: - values = self.values - values = values.astype(dtype, copy=copy) - return self.make_block_same_class(values=values, - placement=self.mgr_locs) - - def __len__(self): - try: - return self.sp_index.length - except AttributeError: - return 0 - - def copy(self, deep=True, mgr=None): - return self.make_block_same_class(values=self.values, - sparse_index=self.sp_index, - kind=self.kind, copy=deep, - placement=self.mgr_locs) - - def make_block_same_class(self, values, placement, sparse_index=None, - kind=None, dtype=None, fill_value=None, - copy=False, ndim=None): - """ return a new block """ - if dtype is None: - dtype = values.dtype - if fill_value is None and not isinstance(values, SparseArray): - fill_value = self.values.fill_value - - # if not isinstance(values, SparseArray) and values.ndim != self.ndim: - # raise ValueError("ndim mismatch") - - if values.ndim == 2: - nitems = values.shape[0] - - if nitems == 0: - # kludgy, but SparseBlocks cannot handle slices, where the - # output is 0-item, so let's convert it to a dense block: it - # won't take space since there's 0 items, plus it will preserve - # the dtype. 
- return self.make_block(np.empty(values.shape, dtype=dtype), - placement) - elif nitems > 1: - raise ValueError("Only 1-item 2d sparse blocks are supported") - else: - values = values.reshape(values.shape[1]) - - new_values = SparseArray(values, sparse_index=sparse_index, - kind=kind or self.kind, dtype=dtype, - fill_value=fill_value, copy=copy) - return self.make_block(new_values, - placement=placement) - - def interpolate(self, method='pad', axis=0, inplace=False, limit=None, - fill_value=None, **kwargs): - - values = missing.interpolate_2d(self.values.to_dense(), method, axis, - limit, fill_value) - return self.make_block_same_class(values=values, - placement=self.mgr_locs) - - def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): - # we may need to upcast our fill to match our dtype - if limit is not None: - raise NotImplementedError("specifying a limit for 'fillna' has " - "not been implemented yet") - values = self.values if inplace else self.values.copy() - values = values.fillna(value, downcast=downcast) - return [self.make_block_same_class(values=values, - placement=self.mgr_locs)] - - def shift(self, periods, axis=0, mgr=None): - """ shift the block by periods """ - N = len(self.values.T) - indexer = np.zeros(N, dtype=int) - if periods > 0: - indexer[periods:] = np.arange(N - periods) - else: - indexer[:periods] = np.arange(-periods, N) - new_values = self.values.to_dense().take(indexer) - # convert integer to float if necessary. need to do a lot more than - # that, handle boolean etc also - new_values, fill_value = maybe_upcast(new_values) - if periods > 0: - new_values[:periods] = fill_value - else: - new_values[periods:] = fill_value - return [self.make_block_same_class(new_values, - placement=self.mgr_locs)] - - def sparse_reindex(self, new_index): - """ sparse reindex and return a new block - current reindex only works for float64 dtype! 
""" - values = self.values - values = values.sp_index.to_int_index().reindex( - values.sp_values.astype('float64'), values.fill_value, new_index) - return self.make_block_same_class(values, sparse_index=new_index, - placement=self.mgr_locs) - - # ----------------------------------------------------------------- # Constructor Helpers @@ -3281,8 +3015,10 @@ def get_block_type(values, dtype=None): dtype = dtype or values.dtype vtype = dtype.type - if is_sparse(values): - cls = SparseBlock + if is_categorical(values): + cls = CategoricalBlock + elif is_extension_array_dtype(values): + cls = ExtensionBlock elif issubclass(vtype, np.floating): cls = FloatBlock elif issubclass(vtype, np.timedelta64): @@ -3290,14 +3026,10 @@ def get_block_type(values, dtype=None): cls = TimeDeltaBlock elif issubclass(vtype, np.complexfloating): cls = ComplexBlock - elif is_categorical(values): - cls = CategoricalBlock - elif is_extension_array_dtype(values): - cls = ExtensionBlock elif issubclass(vtype, np.datetime64): - assert not is_datetimetz(values) + assert not is_datetime64tz_dtype(values) cls = DatetimeBlock - elif is_datetimetz(values): + elif is_datetime64tz_dtype(values): cls = DatetimeTZBlock elif issubclass(vtype, np.integer): cls = IntBlock @@ -3318,7 +3050,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, dtype = dtype or values.dtype klass = get_block_type(values, dtype) - elif klass is DatetimeTZBlock and not is_datetimetz(values): + elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values): return klass(values, ndim=ndim, placement=placement, dtype=dtype) @@ -3350,7 +3082,11 @@ def _block_shape(values, ndim=1, shape=None): if values.ndim < ndim: if shape is None: shape = values.shape - values = values.reshape(tuple((1, ) + shape)) + if not is_extension_array_dtype(values): + # TODO: https://github.com/pandas-dev/pandas/issues/23023 + # block.shape is incorrect for "2D" ExtensionArrays + # We can't, and don't need to, reshape. + values = values.reshape(tuple((1, ) + shape)) return values @@ -3369,7 +3105,7 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): # FIXME: optimization potential in case all mgrs contain slices and # combination of those slices is a slice, too. 
new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) - new_values = _vstack([b.values for b in blocks], dtype) + new_values = np.vstack([b.values for b in blocks]) argsort = np.argsort(new_mgr_locs) new_values = new_values[argsort] @@ -3381,17 +3117,6 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): return blocks -def _vstack(to_stack, dtype): - - # work around NumPy 1.6 bug - if dtype == _NS_DTYPE or dtype == _TD_DTYPE: - new_values = np.vstack([x.view('i8') for x in to_stack]) - return new_values.view(dtype) - - else: - return np.vstack(to_stack) - - def _block2d_to_blocknd(values, placement, shape, labels, ref_items): """ pivot to the labels shape """ panel_shape = (len(placement),) + shape diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 5a3f11525acf8..2441c64518d59 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,22 +1,20 @@ # -*- coding: utf-8 -*- # TODO: Needs a better name; too many modules are already called "concat" -import copy from collections import defaultdict +import copy import numpy as np -from pandas._libs import tslibs, internals as libinternals +from pandas._libs import internals as libinternals, tslibs from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.common import ( - is_timedelta64_dtype, - is_datetime64_dtype, is_datetimetz, - is_categorical_dtype, - is_float_dtype, is_numeric_dtype, - _get_dtype) from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import ( + _get_dtype, is_categorical_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_float_dtype, is_numeric_dtype, is_sparse, + is_timedelta64_dtype) import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos @@ -150,11 +148,8 @@ def is_na(self): values = self.block.values if self.block.is_categorical: values_flat = values.categories - elif self.block.is_sparse: - # fill_value is not NaN and have holes - if not values._null_fill_value and values.sp_index.ngaps > 0: - return False - values_flat = values.ravel(order='K') + elif is_sparse(self.block.values.dtype): + return False elif self.block.is_extension: values_flat = values else: @@ -184,8 +179,12 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): if len(values) and values[0] is None: fill_value = None - if getattr(self.block, 'is_datetimetz', False) or \ - is_datetimetz(empty_dtype): + if (getattr(self.block, 'is_datetimetz', False) or + is_datetime64tz_dtype(empty_dtype)): + if self.block is None: + array = empty_dtype.construct_array_type() + missing_arr = array([fill_value], dtype=empty_dtype) + return missing_arr.repeat(self.shape[1]) pass elif getattr(self.block, 'is_categorical', False): pass @@ -268,7 +267,6 @@ def get_empty_dtype_and_na(join_units): dtype na """ - if len(join_units) == 1: blk = join_units[0].block if blk is None: @@ -296,7 +294,7 @@ def get_empty_dtype_and_na(join_units): if is_categorical_dtype(dtype): upcast_cls = 'category' - elif is_datetimetz(dtype): + elif is_datetime64tz_dtype(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' @@ -306,6 +304,8 @@ def get_empty_dtype_and_na(join_units): upcast_cls = 'datetime' elif is_timedelta64_dtype(dtype): upcast_cls = 'timedelta' + elif is_sparse(dtype): + upcast_cls = dtype.subtype.name elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name 
else: @@ -340,14 +340,19 @@ def get_empty_dtype_and_na(join_units): elif 'timedelta' in upcast_classes: return np.dtype('m8[ns]'), tslibs.iNaT else: # pragma - g = np.find_common_type(upcast_classes, []) - if is_float_dtype(g): - return g, g.type(np.nan) - elif is_numeric_dtype(g): - if has_none_blocks: - return np.float64, np.nan - else: - return g, None + try: + g = np.find_common_type(upcast_classes, []) + except TypeError: + # At least one is an ExtensionArray + return np.dtype(np.object_), np.nan + else: + if is_float_dtype(g): + return g, g.type(np.nan) + elif is_numeric_dtype(g): + if has_none_blocks: + return np.float64, np.nan + else: + return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py new file mode 100644 index 0000000000000..c437456794f43 --- /dev/null +++ b/pandas/core/internals/construction.py @@ -0,0 +1,708 @@ +""" +Functions for preparing various inputs passed to the DataFrame or Series +constructors before passing them to a BlockManager. +""" +from collections import OrderedDict + +import numpy as np +import numpy.ma as ma + +from pandas._libs import lib +from pandas._libs.tslibs import IncompatibleFrequency +import pandas.compat as compat +from pandas.compat import ( + get_range_parameters, lmap, lrange, raise_with_traceback, range) + +from pandas.core.dtypes.cast import ( + construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, + construct_1d_object_array_from_listlike, infer_dtype_from_scalar, + maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable, + maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast) +from pandas.core.dtypes.common import ( + is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal, + is_extension_array_dtype, is_extension_type, is_float_dtype, + is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype) +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries, + ABCTimedeltaIndex) +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms, common as com +from pandas.core.arrays import Categorical, ExtensionArray, period_array +from pandas.core.index import ( + Index, _get_objs_combined_axis, _union_indexes, ensure_index) +from pandas.core.indexes import base as ibase +from pandas.core.internals import ( + create_block_manager_from_arrays, create_block_manager_from_blocks) + +# --------------------------------------------------------------------- +# BlockManager Interface + + +def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. + + Needs to handle a lot of exceptional cases. + """ + # figure out the index, if necessary + if index is None: + index = extract_index(arrays) + else: + index = ensure_index(index) + + # don't force copy because getting jammed in an ndarray anyway + arrays = _homogenize(arrays, index, dtype) + + # from BlockManager perspective + axes = [ensure_index(columns), index] + + return create_block_manager_from_arrays(arrays, arr_names, axes) + + +def masked_rec_array_to_mgr(data, index, columns, dtype, copy): + """ + Extract from a masked rec array and create the manager. 
+ """ + + # essentially process a record array then fill it + fill_value = data.fill_value + fdata = ma.getdata(data) + if index is None: + index = get_names_from_index(fdata) + if index is None: + index = ibase.default_index(len(data)) + index = ensure_index(index) + + if columns is not None: + columns = ensure_index(columns) + arrays, arr_columns = to_arrays(fdata, columns) + + # fill if needed + new_arrays = [] + for fv, arr, col in zip(fill_value, arrays, arr_columns): + mask = ma.getmaskarray(data[col]) + if mask.any(): + arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) + arr[mask] = fv + new_arrays.append(arr) + + # create the manager + arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns) + if columns is None: + columns = arr_columns + + mgr = arrays_to_mgr(arrays, arr_columns, index, columns) + + if copy: + mgr = mgr.copy() + return mgr + + +# --------------------------------------------------------------------- +# DataFrame Constructor Interface + +def init_ndarray(values, index, columns, dtype=None, copy=False): + # input must be a ndarray, list, Series, index + + if isinstance(values, ABCSeries): + if columns is None: + if values.name is not None: + columns = [values.name] + if index is None: + index = values.index + else: + values = values.reindex(index) + + # zero len case (GH #2234) + if not len(values) and columns is not None and len(columns): + values = np.empty((0, 1), dtype=object) + + # we could have a categorical type passed or coerced to 'category' + # recast this to an arrays_to_mgr + if (is_categorical_dtype(getattr(values, 'dtype', None)) or + is_categorical_dtype(dtype)): + + if not hasattr(values, 'dtype'): + values = prep_ndarray(values, copy=copy) + values = values.ravel() + elif copy: + values = values.copy() + + index, columns = _get_axes(len(values), 1, index, columns) + return arrays_to_mgr([values], columns, index, columns, + dtype=dtype) + elif (is_datetime64tz_dtype(values) or + is_extension_array_dtype(values)): + # GH#19157 + if columns is None: + columns = [0] + return arrays_to_mgr([values], columns, index, columns, + dtype=dtype) + + # by definition an array here + # the dtypes will be coerced to a single dtype + values = prep_ndarray(values, copy=copy) + + if dtype is not None: + if not is_dtype_equal(values.dtype, dtype): + try: + values = values.astype(dtype) + except Exception as orig: + e = ValueError("failed to cast to '{dtype}' (Exception " + "was: {orig})".format(dtype=dtype, + orig=orig)) + raise_with_traceback(e) + + index, columns = _get_axes(*values.shape, index=index, columns=columns) + values = values.T + + # if we don't have a dtype specified, then try to convert objects + # on the entire block; this is to convert if we have datetimelike's + # embedded in an object type + if dtype is None and is_object_dtype(values): + values = maybe_infer_to_datetimelike(values) + + return create_block_manager_from_blocks([values], [columns, index]) + + +def init_dict(data, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. + Needs to handle a lot of exceptional cases. 
+ """ + if columns is not None: + from pandas.core.series import Series + arrays = Series(data, index=columns, dtype=object) + data_names = arrays.index + + missing = arrays.isnull() + if index is None: + # GH10856 + # raise ValueError if only scalars in dict + index = extract_index(arrays[~missing]) + else: + index = ensure_index(index) + + # no obvious "empty" int column + if missing.any() and not is_integer_dtype(dtype): + if dtype is None or np.issubdtype(dtype, np.flexible): + # GH#1783 + nan_dtype = object + else: + nan_dtype = dtype + val = construct_1d_arraylike_from_scalar(np.nan, len(index), + nan_dtype) + arrays.loc[missing] = [val] * missing.sum() + + else: + + for key in data: + if (isinstance(data[key], ABCDatetimeIndex) and + data[key].tz is not None): + # GH#24096 need copy to be deep for datetime64tz case + # TODO: See if we can avoid these copies + data[key] = data[key].copy(deep=True) + + keys = com.dict_keys_to_ordered_list(data) + columns = data_names = Index(keys) + arrays = [data[k] for k in keys] + + return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + + +# --------------------------------------------------------------------- + +def prep_ndarray(values, copy=True): + if not isinstance(values, (np.ndarray, ABCSeries, Index)): + if len(values) == 0: + return np.empty((0, 0), dtype=object) + + def convert(v): + return maybe_convert_platform(v) + + # we could have a 1-dim or 2-dim list here + # this is equiv of np.asarray, but does object conversion + # and platform dtype preservation + try: + if is_list_like(values[0]) or hasattr(values[0], 'len'): + values = np.array([convert(v) for v in values]) + elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: + # GH#21861 + values = np.array([convert(v) for v in values]) + else: + values = convert(values) + except (ValueError, TypeError): + values = convert(values) + + else: + + # drop subclass info, do not copy data + values = np.asarray(values) + if copy: + values = values.copy() + + if values.ndim == 1: + values = values.reshape((values.shape[0], 1)) + elif values.ndim != 2: + raise ValueError('Must pass 2-d input') + + return values + + +def _homogenize(data, index, dtype=None): + oindex = None + homogenized = [] + + for val in data: + if isinstance(val, ABCSeries): + if dtype is not None: + val = val.astype(dtype) + if val.index is not index: + # Forces alignment. 
No need to copy data since we + # are putting it into an ndarray later + val = val.reindex(index, copy=False) + else: + if isinstance(val, dict): + if oindex is None: + oindex = index.astype('O') + + if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): + val = com.dict_compat(val) + else: + val = dict(val) + val = lib.fast_multiget(val, oindex.values, default=np.nan) + val = sanitize_array(val, index, dtype=dtype, copy=False, + raise_cast_failure=False) + + homogenized.append(val) + + return homogenized + + +def extract_index(data): + index = None + if len(data) == 0: + index = Index([]) + elif len(data) > 0: + raw_lengths = [] + indexes = [] + + have_raw_arrays = False + have_series = False + have_dicts = False + + for val in data: + if isinstance(val, ABCSeries): + have_series = True + indexes.append(val.index) + elif isinstance(val, dict): + have_dicts = True + indexes.append(list(val.keys())) + elif is_list_like(val) and getattr(val, 'ndim', 1) == 1: + have_raw_arrays = True + raw_lengths.append(len(val)) + + if not indexes and not raw_lengths: + raise ValueError('If using all scalar values, you must pass' + ' an index') + + if have_series or have_dicts: + index = _union_indexes(indexes) + + if have_raw_arrays: + lengths = list(set(raw_lengths)) + if len(lengths) > 1: + raise ValueError('arrays must all be same length') + + if have_dicts: + raise ValueError('Mixing dicts with non-Series may lead to ' + 'ambiguous ordering.') + + if have_series: + if lengths[0] != len(index): + msg = ('array length {length} does not match index ' + 'length {idx_len}' + .format(length=lengths[0], idx_len=len(index))) + raise ValueError(msg) + else: + index = ibase.default_index(lengths[0]) + + return ensure_index(index) + + +def reorder_arrays(arrays, arr_columns, columns): + # reorder according to the columns + if (columns is not None and len(columns) and arr_columns is not None and + len(arr_columns)): + indexer = ensure_index(arr_columns).get_indexer(columns) + arr_columns = ensure_index([arr_columns[i] for i in indexer]) + arrays = [arrays[i] for i in indexer] + return arrays, arr_columns + + +def get_names_from_index(data): + has_some_name = any(getattr(s, 'name', None) is not None for s in data) + if not has_some_name: + return ibase.default_index(len(data)) + + index = lrange(len(data)) + count = 0 + for i, s in enumerate(data): + n = getattr(s, 'name', None) + if n is not None: + index[i] = n + else: + index[i] = 'Unnamed {count}'.format(count=count) + count += 1 + + return index + + +def _get_axes(N, K, index, columns): + # helper to create the axes as indexes + # return axes or defaults + + if index is None: + index = ibase.default_index(N) + else: + index = ensure_index(index) + + if columns is None: + columns = ibase.default_index(K) + else: + columns = ensure_index(columns) + return index, columns + + +# --------------------------------------------------------------------- +# Conversion of Inputs to Arrays + +def to_arrays(data, columns, coerce_float=False, dtype=None): + """ + Return list of arrays, columns. 
+ """ + if isinstance(data, ABCDataFrame): + if columns is not None: + arrays = [data._ixs(i, axis=1).values + for i, col in enumerate(data.columns) if col in columns] + else: + columns = data.columns + arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] + + return arrays, columns + + if not len(data): + if isinstance(data, np.ndarray): + columns = data.dtype.names + if columns is not None: + return [[]] * len(columns), columns + return [], [] # columns if columns is not None else [] + if isinstance(data[0], (list, tuple)): + return _list_to_arrays(data, columns, coerce_float=coerce_float, + dtype=dtype) + elif isinstance(data[0], compat.Mapping): + return _list_of_dict_to_arrays(data, columns, + coerce_float=coerce_float, dtype=dtype) + elif isinstance(data[0], ABCSeries): + return _list_of_series_to_arrays(data, columns, + coerce_float=coerce_float, + dtype=dtype) + elif isinstance(data[0], Categorical): + if columns is None: + columns = ibase.default_index(len(data)) + return data, columns + elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and + data.dtype.names is not None): + + columns = list(data.dtype.names) + arrays = [data[k] for k in columns] + return arrays, columns + else: + # last ditch effort + data = lmap(tuple, data) + return _list_to_arrays(data, columns, coerce_float=coerce_float, + dtype=dtype) + + +def _list_to_arrays(data, columns, coerce_float=False, dtype=None): + if len(data) > 0 and isinstance(data[0], tuple): + content = list(lib.to_object_array_tuples(data).T) + else: + # list of lists + content = list(lib.to_object_array(data).T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + + +def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): + if columns is None: + columns = _get_objs_combined_axis(data, sort=False) + + indexer_cache = {} + + aligned_values = [] + for s in data: + index = getattr(s, 'index', None) + if index is None: + index = ibase.default_index(len(s)) + + if id(index) in indexer_cache: + indexer = indexer_cache[id(index)] + else: + indexer = indexer_cache[id(index)] = index.get_indexer(columns) + + values = com.values_from_object(s) + aligned_values.append(algorithms.take_1d(values, indexer)) + + values = np.vstack(aligned_values) + + if values.dtype == np.object_: + content = list(values.T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + else: + return values.T, columns + + +def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): + if columns is None: + gen = (list(x.keys()) for x in data) + sort = not any(isinstance(d, OrderedDict) for d in data) + columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) + + # assure that they are of the base dict class and not of derived + # classes + data = [(type(d) is dict) and d or dict(d) for d in data] + + content = list(lib.dicts_to_array(data, list(columns)).T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + + +def _convert_object_array(content, columns, coerce_float=False, dtype=None): + if columns is None: + columns = ibase.default_index(len(content)) + else: + if len(columns) != len(content): # pragma: no cover + # caller's responsibility to check for this... 
+ raise AssertionError('{col:d} columns passed, passed data had ' + '{con} columns'.format(col=len(columns), + con=len(content))) + + # provide soft conversion of object dtypes + def convert(arr): + if dtype != object and dtype != np.object: + arr = lib.maybe_convert_objects(arr, try_float=coerce_float) + arr = maybe_cast_to_datetime(arr, dtype) + return arr + + arrays = [convert(arr) for arr in content] + + return arrays, columns + + +# --------------------------------------------------------------------- +# Series-Based + +def sanitize_index(data, index, copy=False): + """ + Sanitize an index type to return an ndarray of the underlying, pass + through a non-Index. + """ + + if index is None: + return data + + if len(data) != len(index): + raise ValueError('Length of values does not match length of index') + + if isinstance(data, ABCIndexClass) and not copy: + pass + elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)): + data = data._values + if copy: + data = data.copy() + + elif isinstance(data, np.ndarray): + + # coerce datetimelike types + if data.dtype.kind in ['M', 'm']: + data = sanitize_array(data, index, copy=copy) + + return data + + +def sanitize_array(data, index, dtype=None, copy=False, + raise_cast_failure=False): + """ + Sanitize input data to an ndarray, copy if specified, coerce to the + dtype if specified. + """ + + if dtype is not None: + dtype = pandas_dtype(dtype) + + if isinstance(data, ma.MaskedArray): + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = maybe_upcast(data, copy=True) + data[mask] = fill_value + else: + data = data.copy() + + # GH#846 + if isinstance(data, (np.ndarray, Index, ABCSeries)): + + if dtype is not None: + subarr = np.array(data, copy=False) + + # possibility of nan -> garbage + if is_float_dtype(data.dtype) and is_integer_dtype(dtype): + if not isna(data).any(): + subarr = _try_cast(data, True, dtype, copy, + raise_cast_failure) + elif copy: + subarr = data.copy() + else: + subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) + elif isinstance(data, Index): + # don't coerce Index types + # e.g. 
indexes can have different conversions (so don't fast path + # them) + # GH#6140 + subarr = sanitize_index(data, index, copy=copy) + else: + + # we will try to copy be-definition here + subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) + + elif isinstance(data, ExtensionArray): + subarr = data + + if dtype is not None and not data.dtype.is_dtype(dtype): + subarr = data.astype(dtype) + + if copy: + subarr = data.copy() + return subarr + + elif isinstance(data, (list, tuple)) and len(data) > 0: + if dtype is not None: + try: + subarr = _try_cast(data, False, dtype, copy, + raise_cast_failure) + except Exception: + if raise_cast_failure: # pragma: no cover + raise + subarr = np.array(data, dtype=object, copy=copy) + subarr = lib.maybe_convert_objects(subarr) + + else: + subarr = maybe_convert_platform(data) + + subarr = maybe_cast_to_datetime(subarr, dtype) + + elif isinstance(data, range): + # GH#16804 + start, stop, step = get_range_parameters(data) + arr = np.arange(start, stop, step, dtype='int64') + subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure) + else: + subarr = _try_cast(data, False, dtype, copy, raise_cast_failure) + + # scalar like, GH + if getattr(subarr, 'ndim', 0) == 0: + if isinstance(data, list): # pragma: no cover + subarr = np.array(data, dtype=object) + elif index is not None: + value = data + + # figure out the dtype from the value (upcast if necessary) + if dtype is None: + dtype, value = infer_dtype_from_scalar(value) + else: + # need to possibly convert the value here + value = maybe_cast_to_datetime(value, dtype) + + subarr = construct_1d_arraylike_from_scalar( + value, len(index), dtype) + + else: + return subarr.item() + + # the result that we want + elif subarr.ndim == 1: + if index is not None: + + # a 1-element ndarray + if len(subarr) != len(index) and len(subarr) == 1: + subarr = construct_1d_arraylike_from_scalar( + subarr[0], len(index), subarr.dtype) + + elif subarr.ndim > 1: + if isinstance(data, np.ndarray): + raise Exception('Data must be 1-dimensional') + else: + subarr = com.asarray_tuplesafe(data, dtype=dtype) + + # This is to prevent mixed-type Series getting all casted to + # NumPy string type, e.g. NaN --> '-1#IND'. + if issubclass(subarr.dtype.type, compat.string_types): + # GH#16605 + # If not empty convert the data to dtype + # GH#19853: If data is a scalar, subarr has already the result + if not lib.is_scalar(data): + if not np.all(isna(data)): + data = np.array(data, dtype=dtype, copy=False) + subarr = np.array(data, dtype=object, copy=copy) + + if is_object_dtype(subarr.dtype) and dtype != 'object': + inferred = lib.infer_dtype(subarr) + if inferred == 'period': + try: + subarr = period_array(subarr) + except IncompatibleFrequency: + pass + + return subarr + + +def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure): + + # perf shortcut as this is the most common case + if take_fast_path: + if maybe_castable(arr) and not copy and dtype is None: + return arr + + try: + # GH#15832: Check if we are requesting a numeric dype and + # that we can convert the data to the requested dtype. 
+ if is_integer_dtype(dtype): + subarr = maybe_cast_to_integer_array(arr, dtype) + + subarr = maybe_cast_to_datetime(arr, dtype) + # Take care in creating object arrays (but iterators are not + # supported): + if is_object_dtype(dtype) and (is_list_like(subarr) and + not (is_iterator(subarr) or + isinstance(subarr, np.ndarray))): + subarr = construct_1d_object_array_from_listlike(subarr) + elif not is_extension_type(subarr): + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, + copy=copy) + except (ValueError, TypeError): + if is_categorical_dtype(dtype): + # We *do* allow casting to categorical, since we know + # that Categorical is the only array type for 'category'. + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) + elif is_extension_array_dtype(dtype): + # create an extension array from its dtype + array_type = dtype.construct_array_type()._from_sequence + subarr = array_type(arr, dtype=dtype, copy=copy) + elif dtype is not None and raise_cast_failure: + raise + else: + subarr = np.array(arr, dtype=object, copy=copy) + return subarr diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2f29f1ae2509f..5f9860ce98b11 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -7,42 +7,34 @@ import numpy as np -from pandas._libs import lib, internals as libinternals - +from pandas._libs import internals as libinternals, lib +from pandas.compat import map, range, zip from pandas.util._validators import validate_bool_kwarg -from pandas.compat import range, map, zip -from pandas.core.dtypes.common import ( - _NS_DTYPE, - is_datetimelike_v_numeric, - is_numeric_v_string_like, is_extension_type, - is_extension_array_dtype, - is_scalar) from pandas.core.dtypes.cast import ( - maybe_promote, - infer_dtype_from_scalar, - find_common_type, - maybe_convert_objects) -from pandas.core.dtypes.missing import isna + find_common_type, infer_dtype_from_scalar, maybe_convert_objects, + maybe_promote) +from pandas.core.dtypes.common import ( + _NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype, + is_extension_type, is_numeric_v_string_like, is_scalar) import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.generic import ABCSeries, ABCExtensionArray +from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries +from pandas.core.dtypes.missing import isna -from pandas.core.base import PandasObject import pandas.core.algorithms as algos -from pandas.core.sparse.array import _maybe_to_sparse - +from pandas.core.arrays.sparse import _maybe_to_sparse +from pandas.core.base import PandasObject from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.indexing import maybe_convert_indices from pandas.io.formats.printing import pprint_thing from .blocks import ( - Block, DatetimeTZBlock, CategoricalBlock, ExtensionBlock, SparseBlock, - _extend_blocks, _merge_blocks, _safe_reshape, - make_block, get_block_type) + Block, CategoricalBlock, DatetimeTZBlock, ExtensionBlock, _extend_blocks, + _merge_blocks, _safe_reshape, get_block_type, make_block) from .concat import ( # all for concatenate_block_managers - concatenate_join_units, is_uniform_join_units, - get_mgr_concatenation_plan, combine_concat_plans) + combine_concat_plans, concatenate_join_units, get_mgr_concatenation_plan, + is_uniform_join_units) # TODO: flexible with index=None and/or items=None @@ -256,9 +248,6 @@ def __getstate__(self): def __setstate__(self, state): def unpickle_block(values, mgr_locs): - # numpy 
< 1.7 pickle compat - if values.dtype == 'M8[us]': - values = values.astype('M8[ns]') return make_block(values, placement=mgr_locs) if (isinstance(state, tuple) and len(state) >= 4 and @@ -373,9 +362,6 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, align_keys = ['new', 'mask'] else: align_keys = ['mask'] - elif f == 'eval': - align_copy = False - align_keys = ['other'] elif f == 'fillna': # fillna internally does putmask, maybe it's better to do this # at mgr, not block level? @@ -405,7 +391,6 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) - kwargs['mgr'] = self applied = getattr(b, f)(**kwargs) result_blocks = _extend_blocks(applied, result_blocks) @@ -443,8 +428,7 @@ def reduction(self, f, axis=0, consolidate=True, transposed=False, axes, blocks = [], [] for b in self.blocks: - kwargs['mgr'] = self - axe, block = getattr(b, f)(axis=axis, **kwargs) + axe, block = getattr(b, f)(axis=axis, axes=self.axes, **kwargs) axes.append(axe) blocks.append(block) @@ -511,9 +495,6 @@ def isna(self, func, **kwargs): def where(self, **kwargs): return self.apply('where', **kwargs) - def eval(self, **kwargs): - return self.apply('eval', **kwargs) - def quantile(self, **kwargs): return self.reduction('quantile', **kwargs) @@ -547,15 +528,11 @@ def convert(self, **kwargs): def replace(self, **kwargs): return self.apply('replace', **kwargs) - def replace_list(self, src_list, dest_list, inplace=False, regex=False, - mgr=None): + def replace_list(self, src_list, dest_list, inplace=False, regex=False): """ do a list replace """ inplace = validate_bool_kwarg(inplace, 'inplace') - if mgr is None: - mgr = self - # figure out our mask a-priori to avoid repeated replacements values = self.as_array() @@ -587,8 +564,7 @@ def comp(s, regex=False): convert = i == src_len result = b._replace_coerce(mask=m, to_replace=s, value=d, inplace=inplace, - convert=convert, regex=regex, - mgr=mgr) + convert=convert, regex=regex) if m.any(): new_rb = _extend_blocks(result, new_rb) else: @@ -723,7 +699,7 @@ def __contains__(self, item): def nblocks(self): return len(self.blocks) - def copy(self, deep=True, mgr=None): + def copy(self, deep=True): """ Make deep or shallow copy of BlockManager @@ -737,7 +713,6 @@ def copy(self, deep=True, mgr=None): ------- copy : BlockManager """ - # this preserves the notion of view copying of axes if deep: if deep == 'all': @@ -786,27 +761,18 @@ def _interleave(self): Return ndarray from blocks with specified item order Items must be contained in the blocks """ + from pandas.core.dtypes.common import is_sparse dtype = _interleaved_dtype(self.blocks) - if is_extension_array_dtype(dtype): - # TODO: https://github.com/pandas-dev/pandas/issues/22791 - # Give EAs some input on what happens here. Sparse needs this. + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. 
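# [editor's aside -- illustrative sketch, not part of this patch]
# The fallback applied when blocks are interleaved into one ndarray: a sparse
# dtype materialises as its dense subtype, while any other extension dtype has
# no ndarray equivalent and falls back to object. A standalone sketch, reusing
# the pandas helpers imported in this hunk:
import numpy as np
from pandas.core.dtypes.common import is_extension_array_dtype, is_sparse

def interleaved_dtype_sketch(dtype):
    if is_sparse(dtype):
        return dtype.subtype            # e.g. Sparse[float64, nan] -> float64
    elif is_extension_array_dtype(dtype):
        return np.dtype(object)         # no common ndarray dtype -> object
    return dtype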
+ if is_sparse(dtype): + dtype = dtype.subtype + elif is_extension_array_dtype(dtype): dtype = 'object' result = np.empty(self.shape, dtype=dtype) - if result.shape[0] == 0: - # Workaround for numpy 1.7 bug: - # - # >>> a = np.empty((0,10)) - # >>> a[slice(0,0)] - # array([], shape=(0, 10), dtype=float64) - # >>> a[[]] - # Traceback (most recent call last): - # File "", line 1, in - # IndexError: index 0 is out of bounds for axis 0 with size 0 - return result - itemmask = np.zeros(self.shape[0]) for blk in self.blocks: @@ -1189,8 +1155,7 @@ def insert(self, loc, item, value, allow_duplicates=False): blk.mgr_locs = new_mgr_locs if loc == self._blklocs.shape[0]: - # np.append is a lot faster (at least in numpy 1.7.1), let's use it - # if we can. + # np.append is a lot faster, let's use it if we can. self._blklocs = np.append(self._blklocs, 0) self._blknos = np.append(self._blknos, len(self.blocks)) else: @@ -1416,18 +1381,21 @@ def canonicalize(block): return all(block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks)) - def unstack(self, unstacker_func): + def unstack(self, unstacker_func, fill_value): """Return a blockmanager with all blocks unstacked. Parameters ---------- unstacker_func : callable A (partially-applied) ``pd.core.reshape._Unstacker`` class. + fill_value : Any + fill_value for newly introduced missing values. Returns ------- unstacked : BlockManager """ + n_rows = self.shape[-1] dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) new_columns = dummy.get_new_columns() new_index = dummy.get_new_index() @@ -1438,7 +1406,10 @@ def unstack(self, unstacker_func): blocks, mask = blk._unstack( partial(unstacker_func, value_columns=self.items[blk.mgr_locs.indexer]), - new_columns) + new_columns, + n_rows, + fill_value + ) new_blocks.extend(blocks) columns_mask.extend(mask) @@ -1634,8 +1605,7 @@ def concat(self, to_concat, new_axis): # check if all series are of the same block type: if len(non_empties) > 0: blocks = [obj.blocks[0] for obj in non_empties] - - if all(type(b) is type(blocks[0]) for b in blocks[1:]): # noqa + if len({b.dtype for b in blocks}) == 1: new_block = blocks[0].concat_same_type(blocks) else: values = [x.values for x in blocks] @@ -1834,7 +1804,7 @@ def _sparse_blockify(tuples, dtype=None): new_blocks = [] for i, names, array in tuples: array = _maybe_to_sparse(array) - block = make_block(array, klass=SparseBlock, placement=[i]) + block = make_block(array, placement=[i]) new_blocks.append(block) return new_blocks @@ -2009,13 +1979,9 @@ def _transform_index(index, func, level=None): def _fast_count_smallints(arr): """Faster version of set(arr) for sequences of small numbers.""" - if len(arr) == 0: - # Handle empty arr case separately: numpy 1.6 chokes on that. 
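# [editor's aside -- not part of this patch]
# With the numpy 1.6 workaround removed, _fast_count_smallints reduces to a
# plain bincount; for example, for small non-negative integers:
import numpy as np

arr = np.array([0, 2, 2, 5])
counts = np.bincount(arr)
nz = counts.nonzero()[0]
print(np.c_[nz, counts[nz]])   # [[0 1] [2 2] [5 1]] -- (value, count) pairs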
- return np.empty((0, 2), dtype=arr.dtype) - else: - counts = np.bincount(arr.astype(np.int_)) - nz = counts.nonzero()[0] - return np.c_[nz, counts[nz]] + counts = np.bincount(arr.astype(np.int_)) + nz = counts.nonzero()[0] + return np.c_[nz, counts[nz]] def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): @@ -2044,10 +2010,9 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): copy : bool """ - concat_plan = combine_concat_plans( - [get_mgr_concatenation_plan(mgr, indexers) - for mgr, indexers in mgrs_indexers], concat_axis) - + concat_plans = [get_mgr_concatenation_plan(mgr, indexers) + for mgr, indexers in mgrs_indexers] + concat_plan = combine_concat_plans(concat_plans, concat_axis) blocks = [] for placement, join_units in concat_plan: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index b2daec327d618..1012639fe0f9d 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,26 +1,19 @@ """ Routines for filling missing data """ +from distutils.version import LooseVersion import operator import numpy as np -from distutils.version import LooseVersion from pandas._libs import algos, lib - from pandas.compat import range, string_types -from pandas.core.dtypes.common import ( - is_numeric_v_string_like, - is_float_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_integer_dtype, - is_scalar, - is_integer, - needs_i8_conversion, - ensure_float64) from pandas.core.dtypes.cast import infer_dtype_from_array +from pandas.core.dtypes.common import ( + ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, is_float_dtype, + is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar, + needs_i8_conversion) from pandas.core.dtypes.missing import isna @@ -760,9 +753,10 @@ def _interp_limit(invalid, fw_limit, bw_limit): .. code-block:: python - for x in np.where(invalid)[0]: - if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): - yield x + def _interp_limit(invalid, fw_limit, bw_limit): + for x in np.where(invalid)[0]: + if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): + yield x """ # handle forward first; the backward direction is the same except # 1. 
operate on the reversed array diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 2884bc1a19491..027f458614bd8 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1,27 +1,24 @@ +from distutils.version import LooseVersion import functools import itertools import operator import warnings -from distutils.version import LooseVersion import numpy as np -import pandas.core.common as com -from pandas import compat -from pandas._libs import tslibs, lib -from pandas.core.config import get_option +from pandas._libs import lib, tslibs +import pandas.compat as compat + from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.common import ( - _get_dtype, - is_float, is_scalar, - is_integer, is_complex, is_float_dtype, - is_complex_dtype, is_integer_dtype, - is_bool_dtype, is_object_dtype, - is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_datetime_or_timedelta_dtype, - is_int_or_datetime_dtype, is_any_int_dtype) -from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype + _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype, + is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float, + is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype, + is_object_dtype, is_scalar, is_timedelta64_dtype) +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna + +import pandas.core.common as com +from pandas.core.config import get_option _BOTTLENECK_INSTALLED = False _MIN_BOTTLENECK_VERSION = '1.0.0' @@ -244,7 +241,7 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None, elif is_float_dtype(dtype): dtype_max = np.float64 - return values, mask, dtype, dtype_max + return values, mask, dtype, dtype_max, fill_value def _isfinite(values): @@ -257,7 +254,9 @@ def _isfinite(values): def _na_ok_dtype(dtype): - return not is_int_or_datetime_dtype(dtype) + # TODO: what about datetime64tz? PeriodDtype? 
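# [editor's aside -- not part of this patch]
# The check below separates dtypes that have a native missing-value
# representation from those that do not; a quick sanity check of the intended
# behaviour, assuming plain numpy dtypes:
import numpy as np

for name in ['float64', 'complex128', 'object', 'int64',
             'datetime64[ns]', 'timedelta64[ns]']:
    dt = np.dtype(name)
    na_ok = not issubclass(dt.type, (np.integer, np.timedelta64, np.datetime64))
    print(name, na_ok)   # True for float/complex/object, False for the others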
+ return not issubclass(dtype.type, + (np.integer, np.timedelta64, np.datetime64)) def _view_if_needed(values): @@ -266,16 +265,21 @@ def _view_if_needed(values): return values -def _wrap_results(result, dtype): +def _wrap_results(result, dtype, fill_value=None): """ wrap our results if needed """ if is_datetime64_dtype(dtype): if not isinstance(result, np.ndarray): + assert not isna(fill_value), "Expected non-null fill_value" + if result == fill_value: + result = np.nan result = tslibs.Timestamp(result) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): + if result == fill_value: + result = np.nan # raise if we have a timedelta64[ns] which is too large if np.fabs(result) > _int64_max: @@ -346,8 +350,8 @@ def nanany(values, axis=None, skipna=True, mask=None): >>> nanops.nanany(s) False """ - values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna, - mask=mask) + values, mask, dtype, _, _ = _get_values(values, skipna, False, copy=skipna, + mask=mask) return values.any(axis) @@ -379,8 +383,8 @@ def nanall(values, axis=None, skipna=True, mask=None): >>> nanops.nanall(s) False """ - values, mask, dtype, _ = _get_values(values, skipna, True, copy=skipna, - mask=mask) + values, mask, dtype, _, _ = _get_values(values, skipna, True, copy=skipna, + mask=mask) return values.all(axis) @@ -409,7 +413,8 @@ def nansum(values, axis=None, skipna=True, min_count=0, mask=None): >>> nanops.nansum(s) 3.0 """ - values, mask, dtype, dtype_max = _get_values(values, skipna, 0, mask=mask) + values, mask, dtype, dtype_max, _ = _get_values(values, + skipna, 0, mask=mask) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype @@ -448,7 +453,8 @@ def nanmean(values, axis=None, skipna=True, mask=None): >>> nanops.nanmean(s) 1.5 """ - values, mask, dtype, dtype_max = _get_values(values, skipna, 0, mask=mask) + values, mask, dtype, dtype_max, _ = _get_values( + values, skipna, 0, mask=mask) dtype_sum = dtype_max dtype_count = np.float64 if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype): @@ -501,7 +507,7 @@ def get_median(x): return np.nan return np.nanmedian(x[mask]) - values, mask, dtype, dtype_max = _get_values(values, skipna, mask=mask) + values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask) if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -705,7 +711,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): @bottleneck_switch() def reduction(values, axis=None, skipna=True, mask=None): - values, mask, dtype, dtype_max = _get_values( + + values, mask, dtype, dtype_max, fill_value = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask) if ((axis is not None and values.shape[axis] == 0) or @@ -719,7 +726,7 @@ def reduction(values, axis=None, skipna=True, mask=None): else: result = getattr(values, meth)(axis) - result = _wrap_results(result, dtype) + result = _wrap_results(result, dtype, fill_value) return _maybe_null_out(result, axis, mask) reduction.__name__ = 'nan' + meth @@ -753,8 +760,8 @@ def nanargmax(values, axis=None, skipna=True, mask=None): >>> nanops.nanargmax(s) 4 """ - values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf', - mask=mask) + values, mask, dtype, _, _ = _get_values( + values, skipna, fill_value_typ='-inf', mask=mask) result = values.argmax(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result @@ -783,8 +790,8 @@ def nanargmin(values, axis=None, 
skipna=True, mask=None): >>> nanops.nanargmin(s) 0 """ - values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf', - mask=mask) + values, mask, dtype, _, _ = _get_values( + values, skipna, fill_value_typ='+inf', mask=mask) result = values.argmin(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 20559bca9caed..bd5268808e7b2 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -5,50 +5,41 @@ """ # necessary to enforce truediv in Python 2.X from __future__ import division + import datetime import operator import textwrap import warnings import numpy as np -import pandas as pd - -from pandas._libs import lib, algos as libalgos, ops as libops - -from pandas import compat -from pandas.util._decorators import Appender +from pandas._libs import algos as libalgos, lib, ops as libops +import pandas.compat as compat from pandas.compat import bind_method -import pandas.core.missing as missing -import pandas.core.common as com - from pandas.errors import NullFrequencyError -from pandas.core.dtypes.missing import notna, isna -from pandas.core.dtypes.common import ( - needs_i8_conversion, - is_datetimelike_v_numeric, - is_period_dtype, - is_integer_dtype, is_categorical_dtype, - is_object_dtype, is_timedelta64_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, - is_bool_dtype, - is_list_like, - is_scalar, - is_extension_array_dtype, - ensure_object) +from pandas.util._decorators import Appender + from pandas.core.dtypes.cast import ( - maybe_upcast_putmask, find_common_type, - construct_1d_object_array_from_listlike) + construct_1d_object_array_from_listlike, find_common_type, + maybe_upcast_putmask) +from pandas.core.dtypes.common import ( + ensure_object, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_datetimelike_v_numeric, is_extension_array_dtype, + is_integer_dtype, is_list_like, is_object_dtype, is_period_dtype, + is_scalar, is_timedelta64_dtype, needs_i8_conversion) from pandas.core.dtypes.generic import ( - ABCSeries, - ABCDataFrame, ABCPanel, - ABCIndex, ABCIndexClass, - ABCSparseSeries, ABCSparseArray) + ABCDataFrame, ABCIndex, ABCIndexClass, ABCPanel, ABCSeries, ABCSparseArray, + ABCSparseSeries) +from pandas.core.dtypes.missing import isna, notna +import pandas as pd +import pandas.core.common as com +import pandas.core.missing as missing # ----------------------------------------------------------------------------- # Ops Wrapping Utilities + def get_op_result_name(left, right): """ Find the appropriate name to pin to an operation result. This result @@ -88,7 +79,7 @@ def _maybe_match_name(a, b): ------- name : str or None - See also + See Also -------- pandas.core.common.consensus_name_attr """ @@ -130,6 +121,13 @@ def maybe_upcast_for_op(obj): # implementation; otherwise operation against numeric-dtype # raises TypeError return pd.Timedelta(obj) + elif isinstance(obj, np.timedelta64) and not isna(obj): + # In particular non-nanosecond timedelta64 needs to be cast to + # nanoseconds, or else we get undesired behavior like + # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') + # The isna check is to avoid casting timedelta64("NaT"), which would + # return NaT and incorrectly be treated as a datetime-NaT. 
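# [editor's aside -- not part of this patch]
# The behaviour motivating the cast below, reproduced standalone (the exact
# numpy result can vary by numpy version; the pandas result is always
# nanosecond resolution):
import numpy as np
import pandas as pd

np.timedelta64(3, 'D') / 2                 # truncates within the 'D' unit, per the comment above
pd.Timedelta(np.timedelta64(3, 'D')) / 2   # Timedelta('1 days 12:00:00')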
+ return pd.Timedelta(obj) elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj): # GH#22390 Unfortunately we need to special-case right-hand # timedelta64 dtypes because numpy casts integer dtypes to @@ -237,7 +235,7 @@ def _gen_eval_kwargs(name): {} >>> _gen_eval_kwargs("rtruediv") - {"reversed": True, "truediv": True} + {'reversed': True, 'truediv': True} """ kwargs = {} @@ -386,124 +384,21 @@ def _get_op_name(op, special): # ----------------------------------------------------------------------------- # Docstring Generation and Templates -_add_example_FRAME = """ ->>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], -... columns=['one']) ->>> a - one -a 1.0 -b 1.0 -c 1.0 -d NaN ->>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], -... two=[np.nan, 2, np.nan, 2]), -... index=['a', 'b', 'd', 'e']) ->>> b - one two -a 1.0 NaN -b NaN 2.0 -d 1.0 NaN -e NaN 2.0 ->>> a.add(b, fill_value=0) - one two -a 2.0 NaN -b 1.0 2.0 -c 1.0 NaN -d 1.0 NaN -e NaN 2.0 -""" - -_sub_example_FRAME = """ ->>> a = pd.DataFrame([2, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], -... columns=['one']) ->>> a - one -a 2.0 -b 1.0 -c 1.0 -d NaN ->>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], -... two=[3, 2, np.nan, 2]), -... index=['a', 'b', 'd', 'e']) ->>> b - one two -a 1.0 3.0 -b NaN 2.0 -d 1.0 NaN -e NaN 2.0 ->>> a.sub(b, fill_value=0) - one two -a 1.0 -3.0 -b 1.0 -2.0 -c 1.0 NaN -d -1.0 NaN -e NaN -2.0 -""" - -_mod_example_FRAME = """ -**Using a scalar argument** - ->>> df = pd.DataFrame([2, 4, np.nan, 6.2], index=["a", "b", "c", "d"], -... columns=['one']) ->>> df - one -a 2.0 -b 4.0 -c NaN -d 6.2 ->>> df.mod(3, fill_value=-1) - one -a 2.0 -b 1.0 -c 2.0 -d 0.2 - -**Using a DataFrame argument** - ->>> df = pd.DataFrame(dict(one=[np.nan, 2, 3, 14], two=[np.nan, 1, 1, 3]), -... index=['a', 'b', 'c', 'd']) ->>> df - one two -a NaN NaN -b 2.0 1.0 -c 3.0 1.0 -d 14.0 3.0 ->>> other = pd.DataFrame(dict(one=[np.nan, np.nan, 6, np.nan], -... three=[np.nan, 10, np.nan, -7]), -... 
index=['a', 'b', 'd', 'e']) ->>> other - one three -a NaN NaN -b NaN 10.0 -d 6.0 NaN -e NaN -7.0 ->>> df.mod(other, fill_value=3) - one three two -a NaN NaN NaN -b 2.0 3.0 1.0 -c 0.0 NaN 1.0 -d 2.0 NaN 0.0 -e NaN -4.0 NaN -""" - _op_descriptions = { # Arithmetic Operators 'add': {'op': '+', 'desc': 'Addition', - 'reverse': 'radd', - 'df_examples': _add_example_FRAME}, + 'reverse': 'radd'}, 'sub': {'op': '-', 'desc': 'Subtraction', - 'reverse': 'rsub', - 'df_examples': _sub_example_FRAME}, + 'reverse': 'rsub'}, 'mul': {'op': '*', 'desc': 'Multiplication', 'reverse': 'rmul', 'df_examples': None}, 'mod': {'op': '%', 'desc': 'Modulo', - 'reverse': 'rmod', - 'df_examples': _mod_example_FRAME}, + 'reverse': 'rmod'}, 'pow': {'op': '**', 'desc': 'Exponential power', 'reverse': 'rpow', @@ -518,34 +413,29 @@ def _get_op_name(op, special): 'df_examples': None}, 'divmod': {'op': 'divmod', 'desc': 'Integer division and modulo', - 'reverse': None, + 'reverse': 'rdivmod', 'df_examples': None}, # Comparison Operators 'eq': {'op': '==', 'desc': 'Equal to', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'ne': {'op': '!=', 'desc': 'Not equal to', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'lt': {'op': '<', 'desc': 'Less than', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'le': {'op': '<=', 'desc': 'Less than or equal to', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'gt': {'op': '>', 'desc': 'Greater than', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'ge': {'op': '>=', 'desc': 'Greater than or equal to', - 'reverse': None, - 'df_examples': None}} + 'reverse': None} +} _op_names = list(_op_descriptions.keys()) for key in _op_names: @@ -602,7 +492,7 @@ def _get_op_name(op, special): e NaN dtype: float64 -See also +See Also -------- Series.{reverse} """ @@ -637,38 +527,295 @@ def _get_op_name(op, special): _flex_doc_FRAME = """ {desc} of dataframe and other, element-wise (binary operator `{op_name}`). -Equivalent to ``{equiv}``, but with support to substitute a fill_value for -missing data in one of the inputs. +Equivalent to ``{equiv}``, but with support to substitute a fill_value +for missing data in one of the inputs. With reverse version, `{reverse}`. + +Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to +arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**. Parameters ---------- -other : Series, DataFrame, or constant -axis : {{0, 1, 'index', 'columns'}} - For Series input, axis to match Series index on -level : int or name +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}} + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). For Series input, axis to match Series index on. +level : int or label Broadcast across a level, matching Index values on the - passed MultiIndex level -fill_value : None or float value, default None + passed MultiIndex level. +fill_value : float or None, default None Fill existing missing (NaN) values, and any new element needed for successful DataFrame alignment, with this value before computation. If data in both corresponding DataFrame locations is missing - the result will be missing + the result will be missing. Notes ----- -Mismatched indices will be unioned together +Mismatched indices will be unioned together. Returns ------- -result : DataFrame +DataFrame + Result of the arithmetic operation. 
+ +See Also +-------- +DataFrame.add : Add DataFrames. +DataFrame.sub : Subtract DataFrames. +DataFrame.mul : Multiply DataFrames. +DataFrame.div : Divide DataFrames (float division). +DataFrame.truediv : Divide DataFrames (float division). +DataFrame.floordiv : Divide DataFrames (integer division). +DataFrame.mod : Calculate modulo (remainder after division). +DataFrame.pow : Calculate exponential power. Examples -------- -{df_examples} +>>> df = pd.DataFrame({{'angles': [0, 3, 4], +... 'degrees': [360, 180, 360]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> df + angles degrees +circle 0 360 +triangle 3 180 +rectangle 4 360 + +Add a scalar with operator version which return the same +results. + +>>> df + 1 + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +>>> df.add(1) + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +Divide by constant with reverse version. + +>>> df.div(10) + angles degrees +circle 0.0 36.0 +triangle 0.3 18.0 +rectangle 0.4 36.0 + +>>> df.rdiv(10) + angles degrees +circle inf 0.027778 +triangle 3.333333 0.055556 +rectangle 2.500000 0.027778 + +Subtract a list and Series by axis with operator version. + +>>> df - [1, 2] + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub([1, 2], axis='columns') + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), +... axis='index') + angles degrees +circle -1 359 +triangle 2 179 +rectangle 3 359 + +Multiply a DataFrame of different shape with operator version. + +>>> other = pd.DataFrame({{'angles': [0, 3, 4]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> other + angles +circle 0 +triangle 3 +rectangle 4 + +>>> df * other + angles degrees +circle 0 NaN +triangle 9 NaN +rectangle 16 NaN + +>>> df.mul(other, fill_value=0) + angles degrees +circle 0 0.0 +triangle 9 0.0 +rectangle 16 0.0 + +Divide by a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'angles': [0, 3, 4, 4, 5, 6], +... 'degrees': [360, 180, 360, 360, 540, 720]}}, +... index=[['A', 'A', 'A', 'B', 'B', 'B'], +... ['circle', 'triangle', 'rectangle', +... 'square', 'pentagon', 'hexagon']]) +>>> df_multindex + angles degrees +A circle 0 360 + triangle 3 180 + rectangle 4 360 +B square 4 360 + pentagon 5 540 + hexagon 6 720 + +>>> df.div(df_multindex, level=1, fill_value=0) + angles degrees +A circle NaN 1.0 + triangle 1.0 1.0 + rectangle 1.0 1.0 +B square 0.0 0.0 + pentagon 0.0 0.0 + hexagon 0.0 0.0 +""" + +_flex_comp_doc_FRAME = """ +{desc} of dataframe and other, element-wise (binary operator `{op_name}`). + +Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison +operators. + +Equivalent to `==`, `=!`, `<=`, `<`, `>=`, `>` with support to choose axis +(rows or columns) and level for comparison. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). +level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + +Returns +------- +DataFrame of bool + Result of the comparison. + +See Also +-------- +DataFrame.eq : Compare DataFrames for equality elementwise. +DataFrame.ne : Compare DataFrames for inequality elementwise. +DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise. 
+DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. +DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise. +DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. -See also +Notes -------- -DataFrame.{reverse} +Mismatched indices will be unioned together. +`NaN` values are considered different (i.e. `NaN` != `NaN`). + +Examples +-------- +>>> df = pd.DataFrame({{'cost': [250, 150, 100], +... 'revenue': [100, 250, 300]}}, +... index=['A', 'B', 'C']) +>>> df + cost revenue +A 250 100 +B 150 250 +C 100 300 + +Compare to a scalar and operator version which return the same +results. + +>>> df == 100 + cost revenue +A False True +B False False +C True False + +>>> df.eq(100) + cost revenue +A False True +B False False +C True False + +Compare to a list and Series by axis and operator version. As shown, +for list axis is by default 'index', but for Series axis is by +default 'columns'. + +>>> df != [100, 250, 300] + cost revenue +A True False +B True False +C True False + +>>> df.ne([100, 250, 300], axis='index') + cost revenue +A True False +B True False +C True False + +>>> df != pd.Series([100, 250, 300]) + cost revenue 0 1 2 +A True True True True True +B True True True True True +C True True True True True + +>>> df.ne(pd.Series([100, 250, 300]), axis='columns') + cost revenue 0 1 2 +A True True True True True +B True True True True True +C True True True True True + +Compare to a DataFrame of different shape. + +>>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}}, +... index=['A', 'B', 'C', 'D']) +>>> other + revenue +A 300 +B 250 +C 100 +D 150 + +>>> df.gt(other) + cost revenue +A False False +B False False +C False True +D False False + +Compare to a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220], +... 'revenue': [100, 250, 300, 200, 175, 225]}}, +... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'], +... ['A', 'B', 'C', 'A', 'B' ,'C']]) +>>> df_multindex + cost revenue +Q1 A 250 100 + B 150 250 + C 100 300 +Q2 A 150 200 + B 300 175 + C 220 225 + +>>> df.le(df_multindex, level=1) + cost revenue +Q1 A True True + B True True + C True True +Q2 A False True + B True False + C True False """ _flex_doc_PANEL = """ @@ -685,7 +832,7 @@ def _get_op_name(op, special): ------- Panel -See also +See Also -------- Panel.{reverse} """ @@ -736,8 +883,7 @@ def _make_flex_doc(op_name, typ): elif typ == 'dataframe': base_doc = _flex_doc_FRAME doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, - equiv=equiv, reverse=op_desc['reverse'], - df_examples=op_desc['df_examples']) + equiv=equiv, reverse=op_desc['reverse']) elif typ == 'panel': base_doc = _flex_doc_PANEL doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, @@ -862,6 +1008,13 @@ def masked_arith_op(x, y, op): # mask is only meaningful for x result = np.empty(x.size, dtype=x.dtype) mask = notna(xrav) + + # 1 ** np.nan is 1. So we have to unmask those. 
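# [editor's aside -- not part of this patch]
# The identity relied on here:
import numpy as np

print(1 ** np.nan)   # 1.0 -- a base of 1 gives 1 even when the exponent is missing
# hence the pow branch unmasks where x == 1 and the rpow branch where y == 1
# (y is the base in the reflected case)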
+ if op == pow: + mask = np.where(x == 1, False, mask) + elif op == rpow: + mask = np.where(y == 1, False, mask) + if mask.any(): with np.errstate(all='ignore'): result[mask] = op(xrav[mask], y) @@ -933,9 +1086,141 @@ def should_series_dispatch(left, right, op): # numpy integer dtypes as timedelta64 dtypes in this scenario return True + if is_datetime64_dtype(ldtype) and is_object_dtype(rdtype): + # in particular case where right is an array of DateOffsets + return True + return False +def dispatch_to_series(left, right, func, str_rep=None, axis=None): + """ + Evaluate the frame operation func(left, right) by evaluating + column-by-column, dispatching to the Series implementation. + + Parameters + ---------- + left : DataFrame + right : scalar or DataFrame + func : arithmetic or comparison operator + str_rep : str or None, default None + axis : {None, 0, 1, "index", "columns"} + + Returns + ------- + DataFrame + """ + # Note: we use iloc to access columns for compat with cases + # with non-unique columns. + import pandas.core.computation.expressions as expressions + + right = lib.item_from_zerodim(right) + if lib.is_scalar(right) or np.ndim(right) == 0: + + def column_op(a, b): + return {i: func(a.iloc[:, i], b) + for i in range(len(a.columns))} + + elif isinstance(right, ABCDataFrame): + assert right._indexed_same(left) + + def column_op(a, b): + return {i: func(a.iloc[:, i], b.iloc[:, i]) + for i in range(len(a.columns))} + + elif isinstance(right, ABCSeries) and axis == "columns": + # We only get here if called via left._combine_match_columns, + # in which case we specifically want to operate row-by-row + assert right.index.equals(left.columns) + + def column_op(a, b): + return {i: func(a.iloc[:, i], b.iloc[i]) + for i in range(len(a.columns))} + + elif isinstance(right, ABCSeries): + assert right.index.equals(left.index) # Handle other cases later + + def column_op(a, b): + return {i: func(a.iloc[:, i], b) + for i in range(len(a.columns))} + + else: + # Remaining cases have less-obvious dispatch rules + raise NotImplementedError(right) + + new_data = expressions.evaluate(column_op, str_rep, left, right) + + result = left._constructor(new_data, index=left.index, copy=False) + # Pin columns instead of passing to constructor for compat with + # non-unique columns case + result.columns = left.columns + return result + + +def dispatch_to_index_op(op, left, right, index_class): + """ + Wrap Series left in the given index_class to delegate the operation op + to the index implementation. DatetimeIndex and TimedeltaIndex perform + type checking, timezone handling, overflow checks, etc. + + Parameters + ---------- + op : binary operator (operator.add, operator.sub, ...) + left : Series + right : object + index_class : DatetimeIndex or TimedeltaIndex + + Returns + ------- + result : object, usually DatetimeIndex, TimedeltaIndex, or Series + """ + left_idx = index_class(left) + + # avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes, + # left_idx may inherit a freq from a cached DatetimeIndex. + # See discussion in GH#19147. + if getattr(left_idx, 'freq', None) is not None: + left_idx = left_idx._shallow_copy(freq=None) + try: + result = op(left_idx, right) + except NullFrequencyError: + # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError + # on add/sub of integers (or int-like). We re-raise as a TypeError. 
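# [editor's aside -- not part of this patch]
# A hypothetical repro of how the re-raise below surfaces to users, assuming a
# freq-less datetime64 Series (behaviour as of this development cycle):
import pandas as pd

s = pd.Series(pd.date_range('2018-01-01', periods=3))
try:
    s + 1   # an integer offset is meaningless without a frequency
except TypeError as err:
    print(err)   # incompatible type for a datetime/timedelta operation [...]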
+ raise TypeError('incompatible type for a datetime/timedelta ' + 'operation [{name}]'.format(name=op.__name__)) + return result + + +def dispatch_to_extension_op(op, left, right): + """ + Assume that left or right is a Series backed by an ExtensionArray, + apply the operator defined by op. + """ + + # The op calls will raise TypeError if the op is not defined + # on the ExtensionArray + + # unbox Series and Index to arrays + if isinstance(left, (ABCSeries, ABCIndexClass)): + new_left = left._values + else: + new_left = left + + if isinstance(right, (ABCSeries, ABCIndexClass)): + new_right = right._values + else: + new_right = right + + res_values = op(new_left, new_right) + res_name = get_op_result_name(left, right) + + if op.__name__ in ['divmod', 'rdivmod']: + return _construct_divmod_result( + left, res_values, left.index, res_name) + + return _construct_result(left, res_values, left.index, res_name) + + # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory # methods @@ -1032,6 +1317,7 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): if have_divmod: # divmod doesn't have an op that is supported by numexpr new_methods['divmod'] = arith_method(cls, divmod, special) + new_methods['rdivmod'] = arith_method(cls, rdivmod, special) new_methods.update(dict( eq=comp_method(cls, operator.eq, special), @@ -1194,49 +1480,6 @@ def _construct_divmod_result(left, result, index, name, dtype=None): ) -def dispatch_to_extension_op(op, left, right): - """ - Assume that left or right is a Series backed by an ExtensionArray, - apply the operator defined by op. - """ - - # The op calls will raise TypeError if the op is not defined - # on the ExtensionArray - # TODO(jreback) - # we need to listify to avoid ndarray, or non-same-type extension array - # dispatching - - if is_extension_array_dtype(left): - - new_left = left.values - if isinstance(right, np.ndarray): - - # handle numpy scalars, this is a PITA - # TODO(jreback) - new_right = lib.item_from_zerodim(right) - if is_scalar(new_right): - new_right = [new_right] - new_right = list(new_right) - elif is_extension_array_dtype(right) and type(left) != type(right): - new_right = list(right) - else: - new_right = right - - else: - - new_left = list(left.values) - new_right = right - - res_values = op(new_left, new_right) - res_name = get_op_result_name(left, right) - - if op.__name__ == 'divmod': - return _construct_divmod_result( - left, res_values, left.index, res_name) - - return _construct_result(left, res_values, left.index, res_name) - - def _arith_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid @@ -1247,7 +1490,7 @@ def _arith_method_SERIES(cls, op, special): eval_kwargs = _gen_eval_kwargs(op_name) fill_zeros = _gen_fill_zeros(op_name) construct_result = (_construct_divmod_result - if op is divmod else _construct_result) + if op in [divmod, rdivmod] else _construct_result) def na_op(x, y): import pandas.core.computation.expressions as expressions @@ -1309,14 +1552,14 @@ def wrapper(left, right): elif is_timedelta64_dtype(left): result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex) return construct_result(left, result, - index=left.index, name=res_name, - dtype=result.dtype) + index=left.index, name=res_name) - elif is_timedelta64_dtype(right) and not is_scalar(right): - # i.e. 
exclude np.timedelta64 object + elif is_timedelta64_dtype(right): + # We should only get here with non-scalar or timedelta64('NaT') + # values for right # Note: we cannot use dispatch_to_index_op because - # that may incorrectly raise TypeError when we - # should get NullFrequencyError + # that may incorrectly raise TypeError when we + # should get NullFrequencyError result = op(pd.Index(left), right) return construct_result(left, result, index=left.index, name=res_name, @@ -1334,40 +1577,6 @@ def wrapper(left, right): return wrapper -def dispatch_to_index_op(op, left, right, index_class): - """ - Wrap Series left in the given index_class to delegate the operation op - to the index implementation. DatetimeIndex and TimedeltaIndex perform - type checking, timezone handling, overflow checks, etc. - - Parameters - ---------- - op : binary operator (operator.add, operator.sub, ...) - left : Series - right : object - index_class : DatetimeIndex or TimedeltaIndex - - Returns - ------- - result : object, usually DatetimeIndex, TimedeltaIndex, or Series - """ - left_idx = index_class(left) - - # avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes, - # left_idx may inherit a freq from a cached DatetimeIndex. - # See discussion in GH#19147. - if getattr(left_idx, 'freq', None) is not None: - left_idx = left_idx._shallow_copy(freq=None) - try: - result = op(left_idx, right) - except NullFrequencyError: - # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError - # on add/sub of integers (or int-like). We re-raise as a TypeError. - raise TypeError('incompatible type for a datetime/timedelta ' - 'operation [{name}]'.format(name=op.__name__)) - return result - - def _comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) @@ -1666,75 +1875,12 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # ----------------------------------------------------------------------------- # DataFrame -def dispatch_to_series(left, right, func, str_rep=None, axis=None): - """ - Evaluate the frame operation func(left, right) by evaluating - column-by-column, dispatching to the Series implementation. - - Parameters - ---------- - left : DataFrame - right : scalar or DataFrame - func : arithmetic or comparison operator - str_rep : str or None, default None - axis : {None, 0, 1, "index", "columns"} - - Returns - ------- - DataFrame - """ - # Note: we use iloc to access columns for compat with cases - # with non-unique columns. 
- import pandas.core.computation.expressions as expressions - - right = lib.item_from_zerodim(right) - if lib.is_scalar(right): - - def column_op(a, b): - return {i: func(a.iloc[:, i], b) - for i in range(len(a.columns))} - - elif isinstance(right, ABCDataFrame): - assert right._indexed_same(left) - - def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[:, i]) - for i in range(len(a.columns))} - - elif isinstance(right, ABCSeries) and axis == "columns": - # We only get here if called via left._combine_match_columns, - # in which case we specifically want to operate row-by-row - assert right.index.equals(left.columns) - - def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[i]) - for i in range(len(a.columns))} - - elif isinstance(right, ABCSeries): - assert right.index.equals(left.index) # Handle other cases later - - def column_op(a, b): - return {i: func(a.iloc[:, i], b) - for i in range(len(a.columns))} - - else: - # Remaining cases have less-obvious dispatch rules - raise NotImplementedError(right) - - new_data = expressions.evaluate(column_op, str_rep, left, right) - - result = left._constructor(new_data, index=left.index, copy=False) - # Pin columns instead of passing to constructor for compat with - # non-unique columns case - result.columns = left.columns - return result - def _combine_series_frame(self, other, func, fill_value=None, axis=None, - level=None, try_cast=True): + level=None): """ Apply binary operator `func` to self, other using alignment and fill - conventions determined by the fill_value, axis, level, and try_cast kwargs. + conventions determined by the fill_value, axis, and level kwargs. Parameters ---------- @@ -1744,7 +1890,6 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, fill_value : object, default None axis : {0, 1, 'columns', 'index', None}, default None level : int or None, default None - try_cast : bool, default True Returns ------- @@ -1759,8 +1904,7 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, if axis == 0: return self._combine_match_index(other, func, level=level) else: - return self._combine_match_columns(other, func, level=level, - try_cast=try_cast) + return self._combine_match_columns(other, func, level=level) else: if not len(other): return self * np.nan @@ -1771,8 +1915,7 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, columns=self.columns) # default axis is columns - return self._combine_match_columns(other, func, level=level, - try_cast=try_cast) + return self._combine_match_columns(other, func, level=level) def _align_method_FRAME(left, right, axis): @@ -1805,12 +1948,7 @@ def to_series(right): elif right.shape[0] == left.shape[0] and right.shape[1] == 1: # Broadcast across columns - try: - right = np.broadcast_to(right, left.shape) - except AttributeError: - # numpy < 1.10.0 - right = np.tile(right, (1, left.shape[1])) - + right = np.broadcast_to(right, left.shape) right = left._constructor(right, index=left.index, columns=left.columns) @@ -1877,13 +2015,13 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): pass_op = op if axis in [0, "columns", None] else na_op return _combine_series_frame(self, other, pass_op, fill_value=fill_value, axis=axis, - level=level, try_cast=True) + level=level) else: if fill_value is not None: self = self.fillna(fill_value) - pass_op = op if lib.is_scalar(other) else na_op - return self._combine_const(other, pass_op, try_cast=True) + assert np.ndim(other) == 0 + return self._combine_const(other, 
op) f.__name__ = op_name @@ -1903,8 +2041,10 @@ def na_op(x, y): result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries)) return result - @Appender('Wrapper for flexible comparison methods {name}' - .format(name=op_name)) + doc = _flex_comp_doc_FRAME.format(op_name=op_name, + desc=_op_descriptions[op_name]['desc']) + + @Appender(doc) def f(self, other, axis=default_axis, level=None): other = _align_method_FRAME(self, other, axis) @@ -1919,9 +2059,10 @@ def f(self, other, axis=default_axis, level=None): elif isinstance(other, ABCSeries): return _combine_series_frame(self, other, na_op, fill_value=None, axis=axis, - level=level, try_cast=False) + level=level) else: - return self._combine_const(other, na_op, try_cast=False) + assert np.ndim(other) == 0, other + return self._combine_const(other, na_op) f.__name__ = op_name @@ -1934,6 +2075,9 @@ def _comp_method_FRAME(cls, func, special): @Appender('Wrapper for comparison method {name}'.format(name=op_name)) def f(self, other): + + other = _align_method_FRAME(self, other, axis=None) + if isinstance(other, ABCDataFrame): # Another DataFrame if not self._indexed_same(other): @@ -1944,14 +2088,12 @@ def f(self, other): elif isinstance(other, ABCSeries): return _combine_series_frame(self, other, func, fill_value=None, axis=None, - level=None, try_cast=False) + level=None) else: # straight boolean comparisons we want to allow all columns # (regardless of dtype to pass thru) See #4537 for discussion. - res = self._combine_const(other, func, - errors='ignore', - try_cast=False) + res = self._combine_const(other, func) return res.fillna(True).astype(bool) f.__name__ = op_name @@ -1998,13 +2140,13 @@ def f(self, other, axis=None): self._get_axis_number(axis) if isinstance(other, self._constructor): - return self._compare_constructor(other, na_op, try_cast=False) + return self._compare_constructor(other, na_op) elif isinstance(other, (self._constructor_sliced, ABCDataFrame, ABCSeries)): raise Exception("input needs alignment for this object [{object}]" .format(object=self._constructor)) else: - return self._combine_const(other, na_op, try_cast=False) + return self._combine_const(other, na_op) f.__name__ = op_name @@ -2066,16 +2208,19 @@ def _cast_sparse_series_op(left, right, opname): left : SparseArray right : SparseArray """ + from pandas.core.sparse.api import SparseDtype + opname = opname.strip('_') + # TODO: This should be moved to the array? 
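# [editor's aside -- not part of this patch]
# SparseDtype (from the sparse extension-array refactor) carries the fill value
# alongside the numpy subtype, which is why the cast below threads
# left.fill_value / right.fill_value through instead of using a bare float64:
import numpy as np
import pandas as pd

dtype = pd.SparseDtype(np.float64, fill_value=0.0)
print(dtype.subtype)                          # float64
print(dtype.fill_value)                       # 0.0
print(pd.SparseDtype(np.float64).fill_value)  # nan -- the default when omitted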
if is_integer_dtype(left) and is_integer_dtype(right): # series coerces to float64 if result should have NaN/inf if opname in ('floordiv', 'mod') and (right.values == 0).any(): - left = left.astype(np.float64) - right = right.astype(np.float64) + left = left.astype(SparseDtype(np.float64, left.fill_value)) + right = right.astype(SparseDtype(np.float64, right.fill_value)) elif opname in ('rfloordiv', 'rmod') and (left.values == 0).any(): - left = left.astype(np.float64) - right = right.astype(np.float64) + left = left.astype(SparseDtype(np.float64, left.fill_value)) + right = right.astype(SparseDtype(np.float64, right.fill_value)) return left, right @@ -2113,7 +2258,7 @@ def _sparse_series_op(left, right, op, name): new_index = left.index new_name = get_op_result_name(left, right) - from pandas.core.sparse.array import _sparse_array_op + from pandas.core.arrays.sparse import _sparse_array_op lvalues, rvalues = _cast_sparse_series_op(left.values, right.values, name) result = _sparse_array_op(lvalues, rvalues, op, name) return left._constructor(result, index=new_index, name=new_name) @@ -2127,7 +2272,7 @@ def _arith_method_SPARSE_ARRAY(cls, op, special): op_name = _get_op_name(op, special) def wrapper(self, other): - from pandas.core.sparse.array import ( + from pandas.core.arrays.sparse.array import ( SparseArray, _sparse_array_op, _wrap_result, _get_fill) if isinstance(other, np.ndarray): if len(self) != len(other): diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 1e2d4000413bb..bb3412a3d7c0c 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -4,36 +4,37 @@ # pylint: disable=E1103,W0231,W0212,W0621 from __future__ import division -import numpy as np import warnings + +import numpy as np + +import pandas.compat as compat +from pandas.compat import OrderedDict, map, range, u, zip +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution, deprecate_kwarg +from pandas.util._validators import validate_axis_style_args + from pandas.core.dtypes.cast import ( - infer_dtype_from_scalar, - cast_scalar_to_array, - maybe_cast_item) + cast_scalar_to_array, infer_dtype_from_scalar, maybe_cast_item) from pandas.core.dtypes.common import ( - is_integer, is_list_like, - is_string_like, is_scalar) + is_integer, is_list_like, is_scalar, is_string_like) from pandas.core.dtypes.missing import notna -import pandas.core.ops as ops import pandas.core.common as com -import pandas.core.indexes.base as ibase -from pandas import compat -from pandas.compat import (map, zip, range, u, OrderedDict) -from pandas.compat.numpy import function as nv from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import (Index, MultiIndex, ensure_index, - _get_objs_combined_axis) -from pandas.io.formats.printing import pprint_thing +from pandas.core.index import ( + Index, MultiIndex, _get_objs_combined_axis, ensure_index) +import pandas.core.indexes.base as ibase from pandas.core.indexing import maybe_droplevels -from pandas.core.internals import (BlockManager, - create_block_manager_from_arrays, - create_block_manager_from_blocks) -from pandas.core.series import Series +from pandas.core.internals import ( + BlockManager, create_block_manager_from_arrays, + create_block_manager_from_blocks) +import pandas.core.ops as ops from pandas.core.reshape.util import cartesian_product -from pandas.util._decorators import Appender, Substitution -from pandas.util._validators import validate_axis_style_args 
+from pandas.core.series import Series + +from pandas.io.formats.printing import pprint_thing _shared_doc_kwargs = dict( axes='items, major_axis, minor_axis', @@ -47,7 +48,7 @@ def _ensure_like_indices(time, panels): """ - Makes sure that time and panels are conformable + Makes sure that time and panels are conformable. """ n_time = len(time) n_panel = len(panels) @@ -62,7 +63,7 @@ def _ensure_like_indices(time, panels): def panel_index(time, panels, names=None): """ - Returns a multi-index suitable for a panel-like DataFrame + Returns a multi-index suitable for a panel-like DataFrame. Parameters ---------- @@ -106,14 +107,14 @@ def panel_index(time, panels, names=None): class Panel(NDFrame): """ - Represents wide format panel data, stored as 3-dimensional array + Represents wide format panel data, stored as 3-dimensional array. - .. deprecated:: 0.20.0 - The recommended way to represent 3-D data are with a MultiIndex on a - DataFrame via the :attr:`~Panel.to_frame()` method or with the - `xarray package `__. - Pandas provides a :attr:`~Panel.to_xarray()` method to automate this - conversion. + .. deprecated:: 0.20.0 + The recommended way to represent 3-D data are with a MultiIndex on a + DataFrame via the :attr:`~Panel.to_frame()` method or with the + `xarray package `__. + Pandas provides a :attr:`~Panel.to_xarray()` method to automate this + conversion. Parameters ---------- @@ -124,10 +125,10 @@ class Panel(NDFrame): axis=1 minor_axis : Index or array-like axis=2 - dtype : dtype, default None - Data type to force, otherwise infer copy : boolean, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input + dtype : dtype, default None + Data type to force, otherwise infer """ @property @@ -156,7 +157,7 @@ def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, def _init_data(self, data, copy, dtype, **kwargs): """ Generate ND initialization; axes are passed - as required objects to __init__ + as required objects to __init__. """ if data is None: data = {} @@ -241,7 +242,7 @@ def _init_arrays(self, arrays, arr_names, axes): @classmethod def from_dict(cls, data, intersect=False, orient='items', dtype=None): """ - Construct Panel from dict of DataFrame objects + Construct Panel from dict of DataFrame objects. Parameters ---------- @@ -330,14 +331,13 @@ def _init_matrix(self, data, axes, dtype=None, copy=False): # ---------------------------------------------------------------------- # Comparison methods - def _compare_constructor(self, other, func, try_cast=True): + def _compare_constructor(self, other, func): if not self._indexed_same(other): raise Exception('Can only compare identically-labeled ' 'same type objects') - new_data = {} - for col in self._info_axis: - new_data[col] = func(self[col], other[col]) + new_data = {col: func(self[col], other[col]) + for col in self._info_axis} d = self._construct_axes_dict(copy=False) return self._constructor(data=new_data, **d) @@ -347,7 +347,7 @@ def _compare_constructor(self, other, func, try_cast=True): def __unicode__(self): """ - Return a string representation for a particular Panel + Return a string representation for a particular Panel. Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. @@ -376,7 +376,7 @@ def _get_plane_axes_index(self, axis): """ Get my plane axes indexes: these are already (as compared with higher level planes), - as we are returning a DataFrame axes indexes + as we are returning a DataFrame axes indexes. 
""" axis_name = self._get_axis_name(axis) @@ -396,7 +396,7 @@ def _get_plane_axes(self, axis): """ Get my plane axes indexes: these are already (as compared with higher level planes), - as we are returning a DataFrame axes + as we are returning a DataFrame axes. """ return [self._get_axis(axi) for axi in self._get_plane_axes_index(axis)] @@ -408,14 +408,14 @@ def to_sparse(self, *args, **kwargs): NOT IMPLEMENTED: do not call this method, as sparsifying is not supported for Panel objects and will raise an error. - Convert to SparsePanel + Convert to SparsePanel. """ raise NotImplementedError("sparsifying is not supported " "for Panel objects") def to_excel(self, path, na_rep='', engine=None, **kwargs): """ - Write each DataFrame in Panel to a separate excel sheet + Write each DataFrame in Panel to a separate excel sheet. Parameters ---------- @@ -472,7 +472,8 @@ def as_matrix(self): # Getting and setting elements def get_value(self, *args, **kwargs): - """Quickly retrieve single value at (item, major, minor) location + """ + Quickly retrieve single value at (item, major, minor) location. .. deprecated:: 0.21.0 @@ -519,7 +520,8 @@ def _get_value(self, *args, **kwargs): _get_value.__doc__ = get_value.__doc__ def set_value(self, *args, **kwargs): - """Quickly set single value at (item, major, minor) location + """ + Quickly set single value at (item, major, minor) location. .. deprecated:: 0.21.0 @@ -618,7 +620,9 @@ def __setitem__(self, key, value): NDFrame._set_item(self, key, mat) def _unpickle_panel_compat(self, state): # pragma: no cover - "Unpickle the panel" + """ + Unpickle the panel. + """ from pandas.io.pickle import _unpickle_array _unpickle = _unpickle_array @@ -686,7 +690,9 @@ def round(self, decimals=0, *args, **kwargs): raise TypeError("decimals must be an integer") def _needs_reindex_multi(self, axes, method, level): - """ don't allow a multi reindex on Panel or above ndim """ + """ + Don't allow a multi reindex on Panel or above ndim. + """ return False def align(self, other, **kwargs): @@ -694,7 +700,7 @@ def align(self, other, **kwargs): def dropna(self, axis=0, how='any', inplace=False): """ - Drop 2D from panel, holding passed axis constant + Drop 2D from panel, holding passed axis constant. Parameters ---------- @@ -745,13 +751,13 @@ def _combine(self, other, func, axis=0): "{otype!s} is not supported in combine operation with " "{selftype!s}".format(otype=type(other), selftype=type(self))) - def _combine_const(self, other, func, try_cast=True): + def _combine_const(self, other, func): with np.errstate(all='ignore'): new_values = func(self.values, other) d = self._construct_axes_dict() return self._constructor(new_values, **d) - def _combine_frame(self, other, func, axis=0, try_cast=True): + def _combine_frame(self, other, func, axis=0): index, columns = self._get_plane_axes(axis) axis = self._get_axis_number(axis) @@ -770,7 +776,7 @@ def _combine_frame(self, other, func, axis=0, try_cast=True): return self._constructor(new_values, self.items, self.major_axis, self.minor_axis) - def _combine_panel(self, other, func, try_cast=True): + def _combine_panel(self, other, func): items = self.items.union(other.items) major = self.major_axis.union(other.major_axis) minor = self.minor_axis.union(other.minor_axis) @@ -786,7 +792,7 @@ def _combine_panel(self, other, func, try_cast=True): def major_xs(self, key): """ - Return slice of panel along major axis + Return slice of panel along major axis. 
Parameters ---------- @@ -805,13 +811,12 @@ def major_xs(self, key): MultiIndex Slicers is a generic way to get/set values on any level or levels and is a superset of major_xs functionality, see :ref:`MultiIndex Slicers ` - """ return self.xs(key, axis=self._AXIS_LEN - 2) def minor_xs(self, key): """ - Return slice of panel along minor axis + Return slice of panel along minor axis. Parameters ---------- @@ -830,13 +835,12 @@ def minor_xs(self, key): MultiIndex Slicers is a generic way to get/set values on any level or levels and is a superset of minor_xs functionality, see :ref:`MultiIndex Slicers ` - """ return self.xs(key, axis=self._AXIS_LEN - 1) def xs(self, key, axis=1): """ - Return slice of panel along selected axis + Return slice of panel along selected axis. Parameters ---------- @@ -855,7 +859,6 @@ def xs(self, key, axis=1): MultiIndex Slicers is a generic way to get/set values on any level or levels and is a superset of xs functionality, see :ref:`MultiIndex Slicers ` - """ axis = self._get_axis_number(axis) if axis == 0: @@ -873,6 +876,8 @@ def xs(self, key, axis=1): def _ixs(self, i, axis=0): """ + Parameters + ---------- i : int, slice, or sequence of integers axis : int """ @@ -900,7 +905,7 @@ def _ixs(self, i, axis=0): def groupby(self, function, axis='major'): """ - Group data on given axis, returning GroupBy object + Group data on given axis, returning GroupBy object. Parameters ---------- @@ -943,59 +948,58 @@ def to_frame(self, filter_observations=True): # size = N * K selector = slice(None, None) - data = {} - for item in self.items: - data[item] = self[item].values.ravel()[selector] + data = {item: self[item].values.ravel()[selector] + for item in self.items} def construct_multi_parts(idx, n_repeat, n_shuffle=1): # Replicates and shuffles MultiIndex, returns individual attributes - labels = [np.repeat(x, n_repeat) for x in idx.labels] + codes = [np.repeat(x, n_repeat) for x in idx.codes] # Assumes that each label is divisible by n_shuffle - labels = [x.reshape(n_shuffle, -1).ravel(order='F') - for x in labels] - labels = [x[selector] for x in labels] + codes = [x.reshape(n_shuffle, -1).ravel(order='F') + for x in codes] + codes = [x[selector] for x in codes] levels = idx.levels names = idx.names - return labels, levels, names + return codes, levels, names def construct_index_parts(idx, major=True): levels = [idx] if major: - labels = [np.arange(N).repeat(K)[selector]] + codes = [np.arange(N).repeat(K)[selector]] names = idx.name or 'major' else: - labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] - labels = [labels.ravel()[selector]] + codes = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] + codes = [codes.ravel()[selector]] names = idx.name or 'minor' names = [names] - return labels, levels, names + return codes, levels, names if isinstance(self.major_axis, MultiIndex): - major_labels, major_levels, major_names = construct_multi_parts( + major_codes, major_levels, major_names = construct_multi_parts( self.major_axis, n_repeat=K) else: - major_labels, major_levels, major_names = construct_index_parts( + major_codes, major_levels, major_names = construct_index_parts( self.major_axis) if isinstance(self.minor_axis, MultiIndex): - minor_labels, minor_levels, minor_names = construct_multi_parts( + minor_codes, minor_levels, minor_names = construct_multi_parts( self.minor_axis, n_repeat=N, n_shuffle=K) else: - minor_labels, minor_levels, minor_names = construct_index_parts( + minor_codes, minor_levels, minor_names = construct_index_parts( self.minor_axis, 
major=False) levels = major_levels + minor_levels - labels = major_labels + minor_labels + codes = major_codes + minor_codes names = major_names + minor_names - index = MultiIndex(levels=levels, labels=labels, names=names, + index = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) return DataFrame(data, index=index, columns=self.items) def apply(self, func, axis='major', **kwargs): """ - Applies function along axis (or axes) of the Panel + Applies function along axis (or axes) of the Panel. Parameters ---------- @@ -1013,21 +1017,21 @@ def apply(self, func, axis='major', **kwargs): Returns a Panel with the square root of each element - >>> p = pd.Panel(np.random.rand(4,3,2)) + >>> p = pd.Panel(np.random.rand(4, 3, 2)) # doctest: +SKIP >>> p.apply(np.sqrt) Equivalent to p.sum(1), returning a DataFrame - >>> p.apply(lambda x: x.sum(), axis=1) + >>> p.apply(lambda x: x.sum(), axis=1) # doctest: +SKIP Equivalent to previous: - >>> p.apply(lambda x: x.sum(), axis='major') + >>> p.apply(lambda x: x.sum(), axis='major') # doctest: +SKIP Return the shapes of each DataFrame over axis 2 (i.e the shapes of items x major), as a Series - >>> p.apply(lambda x: x.shape, axis=(0,1)) + >>> p.apply(lambda x: x.shape, axis=(0,1)) # doctest: +SKIP Returns ------- @@ -1117,8 +1121,9 @@ def _apply_1d(self, func, axis): return self._construct_return_type(results, planes) def _apply_2d(self, func, axis): - """ handle 2-d slices, equiv to iterating over the other axis """ - + """ + Handle 2-d slices, equiv to iterating over the other axis. + """ ndim = self.ndim axis = [self._get_axis_number(a) for a in axis] @@ -1174,7 +1179,9 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, return self._construct_return_type(result, axes) def _construct_return_type(self, result, axes=None): - """ return the type for the ndim of the result """ + """ + Return the type for the ndim of the result. + """ ndim = getattr(result, 'ndim', None) # need to assume they are the same @@ -1235,7 +1242,12 @@ def reindex(self, *args, **kwargs): kwargs.update(axes) kwargs.pop('axis', None) kwargs.pop('labels', None) - return super(Panel, self).reindex(**kwargs) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + # do not warn about constructing Panel when reindexing + result = super(Panel, self).reindex(**kwargs) + return result @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.rename.__doc__) @@ -1305,6 +1317,7 @@ def count(self, axis='major'): def shift(self, periods=1, freq=None, axis='major'): """ Shift index by desired number of periods with an optional time freq. + The shifted data will not include the dropped periods and the shifted axis will be smaller than the original. This is different from the behavior of DataFrame.shift() @@ -1330,7 +1343,7 @@ def tshift(self, periods=1, freq=None, axis='major'): def join(self, other, how='left', lsuffix='', rsuffix=''): """ - Join items with other Panel either on major and minor axes column + Join items with other Panel either on major and minor axes column. 
 Parameters
         ----------
@@ -1377,25 +1390,37 @@ def join(self, other, how='left', lsuffix='', rsuffix=''):
             return concat([self] + list(other), axis=0, join=how,
                           join_axes=join_axes, verify_integrity=True)
 
+    @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors',
+                     mapping={False: 'ignore', True: 'raise'})
     def update(self, other, join='left', overwrite=True, filter_func=None,
-               raise_conflict=False):
+               errors='ignore'):
         """
-        Modify Panel in place using non-NA values from passed
-        Panel, or object coercible to Panel. Aligns on items
+        Modify Panel in place using non-NA values from other Panel.
+
+        May also use object coercible to Panel. Will align on items.
 
         Parameters
        ----------
         other : Panel, or object coercible to Panel
-        join : How to join individual DataFrames
-            {'left', 'right', 'outer', 'inner'}, default 'left'
-        overwrite : boolean, default True
-            If True then overwrite values for common keys in the calling panel
-        filter_func : callable(1d-array) -> 1d-array, default None
+            The object from which the caller will be updated.
+        join : {'left', 'right', 'outer', 'inner'}, default 'left'
+            How individual DataFrames are joined.
+        overwrite : bool, default True
+            If True then overwrite values for common keys in the calling Panel.
+        filter_func : callable(1d-array) -> 1d-array, default None
             Can choose to replace values other than NA. Return True for values
-            that should be updated
-        raise_conflict : bool
-            If True, will raise an error if a DataFrame and other both
-            contain data in the same place.
+            that should be updated.
+        errors : {'raise', 'ignore'}, default 'ignore'
+            If 'raise', will raise an error if a DataFrame and other both
+            contain data in the same place.
+
+            .. versionchanged:: 0.24.0
+               Changed from `raise_conflict=False|True`
+               to `errors='ignore'|'raise'`.
+
+        See Also
+        --------
+        DataFrame.update : Similar method for DataFrames.
+        dict.update : Similar method for dictionaries.
         """
 
         if not isinstance(other, self._constructor):
@@ -1406,8 +1431,8 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
             other = other.reindex(**{axis_name: axis_values})
 
         for frame in axis_values:
-            self[frame].update(other[frame], join, overwrite, filter_func,
-                               raise_conflict)
+            self[frame].update(other[frame], join=join, overwrite=overwrite,
+                               filter_func=filter_func, errors=errors)
 
     def _get_join_index(self, other, how):
         if how == 'left':
@@ -1425,13 +1450,17 @@ def _get_join_index(self, other, how):
     # miscellaneous data creation
     @staticmethod
     def _extract_axes(self, data, axes, **kwargs):
-        """ return a list of the axis indices """
+        """
+        Return a list of the axis indices.
+        """
        return [self._extract_axis(self, data, axis=i, **kwargs)
                 for i, a in enumerate(axes)]
 
     @staticmethod
     def _extract_axes_for_slice(self, axes):
-        """ return the slice dictionary for these axes """
+        """
+        Return the slice dictionary for these axes.
+ """ return {self._AXIS_SLICEMAP[i]: a for i, a in zip(self._AXIS_ORDERS[self._AXIS_LEN - len(axes):], axes)} diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 70a8deb33b7f2..f2cf17f8f060d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,37 +1,37 @@ -from datetime import timedelta -import numpy as np -import warnings import copy +from datetime import timedelta from textwrap import dedent +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs import NaT, Timestamp +from pandas._libs.tslibs.period import IncompatibleFrequency +import pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries import pandas as pd +import pandas.core.algorithms as algos +from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin -from pandas.core.groupby.ops import BinGrouper +from pandas.core.groupby.generic import PanelGroupBy, SeriesGroupBy from pandas.core.groupby.groupby import ( - _GroupBy, GroupBy, groupby, _pipe_template -) + GroupBy, _GroupBy, _pipe_template, groupby) from pandas.core.groupby.grouper import Grouper -from pandas.core.groupby.generic import SeriesGroupBy, PanelGroupBy - -from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod +from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range -from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.tseries.offsets import (DateOffset, Tick, Day, - delta_to_nanoseconds, Nano) from pandas.core.indexes.period import PeriodIndex -from pandas.errors import AbstractMethodError -import pandas.core.algorithms as algos -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries - -import pandas.compat as compat -from pandas.compat.numpy import function as nv +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range -from pandas._libs import lib -from pandas._libs.tslibs import Timestamp, NaT -from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas.tseries.frequencies import is_subperiod, is_superperiod, to_offset +from pandas.tseries.offsets import ( + DateOffset, Day, Nano, Tick, delta_to_nanoseconds) -from pandas.util._decorators import Appender, Substitution -from pandas.core.generic import _shared_docs _shared_docs_kwargs = dict() @@ -81,7 +81,9 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.groupby._set_grouper(self._convert_obj(obj), sort=True) def __unicode__(self): - """ provide a nice str repr of our rolling object """ + """ + Provide a nice str repr of our rolling object. + """ attrs = ["{k}={v}".format(k=k, v=getattr(self.groupby, k)) for k in self._attributes if getattr(self.groupby, k, None) is not None] @@ -100,7 +102,7 @@ def __getattr__(self, attr): def __iter__(self): """ - Resampler iterator + Resampler iterator. Returns ------- @@ -110,7 +112,6 @@ def __iter__(self): See Also -------- GroupBy.__iter__ - """ self._set_binner() return super(Resampler, self).__iter__() @@ -125,14 +126,18 @@ def ax(self): @property def _typ(self): - """ masquerade for compat as a Series or a DataFrame """ + """ + Masquerade for compat as a Series or a DataFrame. 
+ """ if isinstance(self._selected_obj, pd.Series): return 'series' return 'dataframe' @property def _from_selection(self): - """ is the resampling from a DataFrame column or MultiIndex level """ + """ + Is the resampling from a DataFrame column or MultiIndex level. + """ # upsampling and PeriodIndex resampling do not work # with selection, this state used to catch and raise an error return (self.groupby is not None and @@ -141,7 +146,7 @@ def _from_selection(self): def _convert_obj(self, obj): """ - provide any conversions for the object in order to correctly handle + Provide any conversions for the object in order to correctly handle. Parameters ---------- @@ -159,17 +164,17 @@ def _get_binner_for_time(self): def _set_binner(self): """ - setup our binners - cache these as we are an immutable object - """ + Setup our binners. + Cache these as we are an immutable object + """ if self.binner is None: self.binner, self.grouper = self._get_binner() def _get_binner(self): """ - create the BinGrouper, assume that self.set_grouper(obj) - has already been called + Create the BinGrouper, assume that self.set_grouper(obj) + has already been called. """ binner, bins, binlabels = self._get_binner_for_time() @@ -177,34 +182,36 @@ def _get_binner(self): return binner, bin_grouper def _assure_grouper(self): - """ make sure that we are creating our binner & grouper """ + """ + Make sure that we are creating our binner & grouper. + """ self._set_binner() @Substitution(klass='Resampler', versionadded='.. versionadded:: 0.23.0', examples=""" ->>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, -... index=pd.date_range('2012-08-02', periods=4)) ->>> df - A -2012-08-02 1 -2012-08-03 2 -2012-08-04 3 -2012-08-05 4 - -To get the difference between each 2-day period's maximum and minimum value in -one pass, you can do - ->>> df.resample('2D').pipe(lambda x: x.max() - x.min()) - A -2012-08-02 1 -2012-08-04 1""") + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each 2-day period's maximum and minimum + value in one pass, you can do + + >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 1 + 2012-08-04 1 + """) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return super(Resampler, self).pipe(func, *args, **kwargs) _agg_doc = dedent(""" - Examples -------- >>> s = pd.Series([1,2,3,4,5], @@ -239,12 +246,11 @@ def pipe(self, func, *args, **kwargs): 2013-01-01 00:00:02 7 4.949747 2013-01-01 00:00:04 5 NaN - See also + See Also -------- pandas.DataFrame.groupby.aggregate pandas.DataFrame.resample.transform pandas.DataFrame.aggregate - """) @Appender(_agg_doc) @@ -273,7 +279,7 @@ def aggregate(self, func, *args, **kwargs): def transform(self, arg, *args, **kwargs): """ Call function producing a like-indexed Series on each group and return - a Series with the transformed values + a Series with the transformed values. Parameters ---------- @@ -299,8 +305,7 @@ def _upsample(self, f, limit=None, fill_value=None): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- @@ -323,7 +328,9 @@ def _gotitem(self, key, ndim, subset=None): return grouped def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): - """ re-evaluate the obj with a groupby aggregation """ + """ + Re-evaluate the obj with a groupby aggregation. 
+ """ if grouper is None: self._set_binner() @@ -355,7 +362,7 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): def _apply_loffset(self, result): """ - if loffset is set, offset the result index + If loffset is set, offset the result index. This is NOT an idempotent routine, it will be applied exactly once to the result. @@ -380,11 +387,15 @@ def _apply_loffset(self, result): return result def _get_resampler_for_grouping(self, groupby, **kwargs): - """ return the correct class for resampling with groupby """ + """ + Return the correct class for resampling with groupby. + """ return self._resampler_for_grouping(self, groupby=groupby, **kwargs) def _wrap_result(self, result): - """ potentially wrap any results """ + """ + Potentially wrap any results. + """ if isinstance(result, ABCSeries) and self._selection is not None: result.name = self._selection @@ -397,7 +408,7 @@ def _wrap_result(self, result): def pad(self, limit=None): """ - Forward fill the values + Forward fill the values. Parameters ---------- @@ -418,23 +429,63 @@ def pad(self, limit=None): def nearest(self, limit=None): """ - Fill values with nearest neighbor starting from center + Resample by using the nearest value. + + When resampling data, missing values may appear (e.g., when the + resampling frequency is higher than the original frequency). + The `nearest` method will replace ``NaN`` values that appeared in + the resampled data with the value from the nearest member of the + sequence, based on the index value. + Missing values that existed in the original data will not be modified. + If `limit` is given, fill only this many values in each direction for + each of the original values. Parameters ---------- - limit : integer, optional - limit of how many values to fill + limit : int, optional + Limit of how many values to fill. .. versionadded:: 0.21.0 Returns ------- - an upsampled Series + Series or DataFrame + An upsampled Series or DataFrame with ``NaN`` values filled with + their nearest value. See Also -------- - Series.fillna - DataFrame.fillna + backfill : Backward fill the new missing values in the resampled data. + pad : Forward fill ``NaN`` values. + + Examples + -------- + >>> s = pd.Series([1, 2], + ... index=pd.date_range('20180101', + ... periods=2, + ... freq='1h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + Freq: H, dtype: int64 + + >>> s.resample('15min').nearest() + 2018-01-01 00:00:00 1 + 2018-01-01 00:15:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 00:45:00 2 + 2018-01-01 01:00:00 2 + Freq: 15T, dtype: int64 + + Limit the number of upsampled values imputed by the nearest: + + >>> s.resample('15min').nearest(limit=1) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:15:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 00:45:00 2.0 + 2018-01-01 01:00:00 2.0 + Freq: 15T, dtype: float64 """ return self._upsample('nearest', limit=limit) @@ -720,12 +771,11 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, def asfreq(self, fill_value=None): """ - return the values at the new freq, - essentially a reindex + Return the values at the new freq, essentially a reindex. Parameters ---------- - fill_value: scalar, optional + fill_value : scalar, optional Value to use for missing values, applied during upsampling (note this does not fill NaNs that already were present). 
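As a quick illustration of the ``fill_value`` behaviour documented in the asfreq docstring above, a minimal sketch (the series, dates and frequencies are invented for the example, not taken from the patch):

import pandas as pd

s = pd.Series([1.0, 2.0],
              index=pd.date_range('2018-01-01', periods=2, freq='2H'))

# Upsampling from 2-hourly to hourly introduces a new 01:00 slot;
# fill_value is applied only to that newly created slot, while values
# (and NaNs) already present in the original data are left untouched.
print(s.resample('1H').asfreq(fill_value=0.0))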
@@ -740,7 +790,7 @@ def asfreq(self, fill_value=None): def std(self, ddof=1, *args, **kwargs): """ - Compute standard deviation of groups, excluding missing values + Compute standard deviation of groups, excluding missing values. Parameters ---------- @@ -752,12 +802,12 @@ def std(self, ddof=1, *args, **kwargs): def var(self, ddof=1, *args, **kwargs): """ - Compute variance of groups, excluding missing values + Compute variance of groups, excluding missing values. Parameters ---------- ddof : integer, default 1 - degrees of freedom + degrees of freedom """ nv.validate_resampler_func('var', args, kwargs) return self._downsample('var', ddof=ddof) @@ -826,8 +876,10 @@ def f(self, _method=method): def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): - """ potentially we might have a deprecation warning, show it - but call the appropriate methods anyhow """ + """ + Potentially we might have a deprecation warning, show it + but call the appropriate methods anyhow. + """ if how is not None: @@ -872,8 +924,9 @@ def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): class _GroupByMixin(GroupByMixin): - """ provide the groupby facilities """ - + """ + Provide the groupby facilities. + """ def __init__(self, obj, *args, **kwargs): parent = kwargs.pop('parent', None) @@ -894,8 +947,8 @@ def __init__(self, obj, *args, **kwargs): def _apply(self, f, grouper=None, *args, **kwargs): """ - dispatch to _upsample; we are stripping all of the _upsample kwargs and - performing the original function call on the grouped object + Dispatch to _upsample; we are stripping all of the _upsample kwargs and + performing the original function call on the grouped object. """ def func(x): @@ -929,7 +982,7 @@ def _get_binner_for_time(self): def _downsample(self, how, **kwargs): """ - Downsample the cython defined function + Downsample the cython defined function. Parameters ---------- @@ -966,6 +1019,7 @@ def _downsample(self, how, **kwargs): def _adjust_binner_for_upsample(self, binner): """ Adjust our binner when upsampling. + The range of a new index should not be outside specified range """ if self.closed == 'right': @@ -976,6 +1030,8 @@ def _adjust_binner_for_upsample(self, binner): def _upsample(self, method, limit=None, fill_value=None): """ + Parameters + ---------- method : string {'backfill', 'bfill', 'pad', 'ffill', 'asfreq'} method for upsampling limit : int, default None @@ -983,7 +1039,7 @@ def _upsample(self, method, limit=None, fill_value=None): fill_value : scalar, default None Value to use for missing values - See also + See Also -------- .fillna @@ -1028,7 +1084,6 @@ class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): Provides a resample of a groupby implementation .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1069,7 +1124,7 @@ def _convert_obj(self, obj): def _downsample(self, how, **kwargs): """ - Downsample the cython defined function + Downsample the cython defined function. 
Parameters ---------- @@ -1106,6 +1161,8 @@ def _downsample(self, how, **kwargs): def _upsample(self, method, limit=None, fill_value=None): """ + Parameters + ---------- method : string {'backfill', 'bfill', 'pad', 'ffill'} method for upsampling limit : int, default None @@ -1113,7 +1170,7 @@ def _upsample(self, method, limit=None, fill_value=None): fill_value : scalar, default None Value to use for missing values - See also + See Also -------- .fillna @@ -1140,10 +1197,9 @@ def _upsample(self, method, limit=None, fill_value=None): class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): """ - Provides a resample of a groupby implementation + Provides a resample of a groupby implementation. .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1162,6 +1218,7 @@ def _get_binner_for_time(self): def _adjust_binner_for_upsample(self, binner): """ Adjust our binner when upsampling. + The range of a new index is allowed to be greater than original range so we don't need to change the length of a binner, GH 13022 """ @@ -1170,10 +1227,9 @@ def _adjust_binner_for_upsample(self, binner): class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): """ - Provides a resample of a groupby implementation + Provides a resample of a groupby implementation. .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1181,7 +1237,9 @@ def _constructor(self): def resample(obj, kind=None, **kwds): - """ create a TimeGrouper and return our resampler """ + """ + Create a TimeGrouper and return our resampler. + """ tg = TimeGrouper(**kwds) return tg._get_resampler(obj, kind=kind) @@ -1191,7 +1249,9 @@ def resample(obj, kind=None, **kwds): def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs): - """ return our appropriate resampler when grouping as well """ + """ + Return our appropriate resampler when grouping as well. + """ # .resample uses 'on' similar to how .groupby uses 'key' kwargs['key'] = kwargs.pop('on', None) @@ -1207,7 +1267,7 @@ def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, class TimeGrouper(Grouper): """ - Custom groupby class for time-interval grouping + Custom groupby class for time-interval grouping. Parameters ---------- @@ -1274,7 +1334,7 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', def _get_resampler(self, obj, kind=None): """ - return my resampler or raise if we have an invalid axis + Return my resampler or raise if we have an invalid axis. 
Parameters ---------- @@ -1338,11 +1398,11 @@ def _get_time_bins(self, ax): # because replace() will swallow the nanosecond part # thus last bin maybe slightly before the end if the end contains # nanosecond part and lead to `Values falls after last bin` error - binner = labels = DatetimeIndex(freq=self.freq, - start=first, - end=last, - tz=tz, - name=ax.name) + binner = labels = date_range(freq=self.freq, + start=first, + end=last, + tz=tz, + name=ax.name) # GH 15549 # In edge case of tz-aware resapmling binner last index can be @@ -1424,12 +1484,12 @@ def _get_time_delta_bins(self, ax): return binner, [], labels start, end = ax.min(), ax.max() - labels = binner = TimedeltaIndex(start=start, - end=end, - freq=self.freq, - name=ax.name) + labels = binner = timedelta_range(start=start, + end=end, + freq=self.freq, + name=ax.name) - end_stamps = labels + 1 + end_stamps = labels + self.freq bins = ax.searchsorted(end_stamps, side='left') # Addresses GH #10530 @@ -1443,17 +1503,18 @@ def _get_time_period_bins(self, ax): raise TypeError('axis must be a DatetimeIndex, but got ' 'an instance of %r' % type(ax).__name__) + freq = self.freq + if not len(ax): - binner = labels = PeriodIndex( - data=[], freq=self.freq, name=ax.name) + binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name) return binner, [], labels labels = binner = PeriodIndex(start=ax[0], end=ax[-1], - freq=self.freq, + freq=freq, name=ax.name) - end_stamps = (labels + 1).asfreq(self.freq, 's').to_timestamp() + end_stamps = (labels + freq).asfreq(freq, 's').to_timestamp() if ax.tzinfo: end_stamps = end_stamps.tz_localize(ax.tzinfo) bins = ax.searchsorted(end_stamps, side='left') @@ -1605,7 +1666,7 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): """ - Utility frequency conversion method for Series/DataFrame + Utility frequency conversion method for Series/DataFrame. 
""" if isinstance(obj.index, PeriodIndex): if method is not None: diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index 7ac1c0cb52fe3..3c76eef809c7a 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -1,8 +1,8 @@ # flake8: noqa from pandas.core.reshape.concat import concat -from pandas.core.reshape.melt import melt, lreshape, wide_to_long +from pandas.core.reshape.melt import lreshape, melt, wide_to_long +from pandas.core.reshape.merge import merge, merge_asof, merge_ordered +from pandas.core.reshape.pivot import crosstab, pivot, pivot_table from pandas.core.reshape.reshape import get_dummies -from pandas.core.reshape.merge import merge, merge_ordered, merge_asof -from pandas.core.reshape.pivot import pivot_table, pivot, crosstab from pandas.core.reshape.tile import cut, qcut diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 1c602a0af1ec1..b13b22d2e8266 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -3,17 +3,19 @@ """ import numpy as np -from pandas import compat, DataFrame, Series, Index, MultiIndex -from pandas.core.index import (_get_objs_combined_axis, - ensure_index, _get_consensus_names, - _all_indexes_same) -from pandas.core.arrays.categorical import (_factorize_from_iterable, - _factorize_from_iterables) -from pandas.core.internals import concatenate_block_managers + +import pandas.core.dtypes.concat as _concat + +from pandas import DataFrame, Index, MultiIndex, Series, compat from pandas.core import common as com -import pandas.core.indexes.base as ibase +from pandas.core.arrays.categorical import ( + _factorize_from_iterable, _factorize_from_iterables) from pandas.core.generic import NDFrame -import pandas.core.dtypes.concat as _concat +from pandas.core.index import ( + _all_indexes_same, _get_consensus_names, _get_objs_combined_axis, + ensure_index) +import pandas.core.indexes.base as ibase +from pandas.core.internals import concatenate_block_managers # --------------------------------------------------------------------- # Concatenate DataFrame objects @@ -175,12 +177,12 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, letter number animal 0 c 3 cat 1 d 4 dog - >>> pd.concat([df1, df3]) - animal letter number - 0 NaN a 1 - 1 NaN b 2 - 0 cat c 3 - 1 dog d 4 + >>> pd.concat([df1, df3], sort=False) + letter number animal + 0 a 1 NaN + 1 b 2 NaN + 0 c 3 cat + 1 d 4 dog Combine ``DataFrame`` objects with overlapping columns and return only those that are shared by passing ``inner`` to @@ -320,7 +322,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, # Standardize axis parameter to int if isinstance(sample, Series): - axis = DataFrame()._get_axis_number(axis) + axis = DataFrame._get_axis_number(axis) else: axis = sample._get_axis_number(axis) @@ -446,7 +448,7 @@ def _get_new_axes(self): new_axes[i] = self._get_comb_axis(i) else: if len(self.join_axes) != ndim - 1: - raise AssertionError("length of join_axes must not be equal " + raise AssertionError("length of join_axes must be equal " "to {length}".format(length=ndim - 1)) # ufff... 
@@ -500,7 +502,7 @@ def _get_concat_axis(self): else: return ibase.default_index(len(self.objs)) else: - return ensure_index(self.keys) + return ensure_index(self.keys).set_names(self.names) else: indexes = [x._data.axes[self.axis] for x in self.objs] @@ -553,9 +555,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): levels = [ensure_index(x) for x in levels] if not _all_indexes_same(indexes): - label_list = [] + codes_list = [] - # things are potentially different sizes, so compute the exact labels + # things are potentially different sizes, so compute the exact codes # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): @@ -568,18 +570,18 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): .format(key=key, level=level)) to_concat.append(np.repeat(i, len(index))) - label_list.append(np.concatenate(to_concat)) + codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) - label_list.extend(concat_index.labels) + codes_list.extend(concat_index.codes) else: codes, categories = _factorize_from_iterable(concat_index) levels.append(categories) - label_list.append(codes) + codes_list.append(codes) if len(names) == len(levels): names = list(names) @@ -592,7 +594,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): # also copies names = names + _get_consensus_names(indexes) - return MultiIndex(levels=levels, labels=label_list, names=names, + return MultiIndex(levels=levels, codes=codes_list, names=names, verify_integrity=False) new_index = indexes[0] @@ -603,8 +605,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): new_names = list(names) new_levels = list(levels) - # construct labels - new_labels = [] + # construct codes + new_codes = [] # do something a bit more speedy @@ -617,17 +619,17 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): raise ValueError('Values not found in passed level: {hlevel!s}' .format(hlevel=hlevel[mask])) - new_labels.append(np.repeat(mapped, n)) + new_codes.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) - new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels]) + new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: new_levels.append(new_index) - new_labels.append(np.tile(np.arange(n), kpieces)) + new_codes.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) - return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, + return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 26221143c0cdf..aafc0de64ee12 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -1,21 +1,21 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 +import re + import numpy as np -from pandas.core.dtypes.common import is_list_like -from pandas import compat -from pandas.core.arrays import Categorical +from pandas.util._decorators import Appender +from pandas.core.dtypes.common import is_extension_type, is_list_like from pandas.core.dtypes.generic import ABCMultiIndex +from pandas.core.dtypes.missing import notna +from pandas import compat +from pandas.core.arrays import Categorical from pandas.core.frame 
import _shared_docs -from pandas.util._decorators import Appender - -import re -from pandas.core.dtypes.missing import notna -from pandas.core.dtypes.common import is_extension_type -from pandas.core.tools.numeric import to_numeric +from pandas.core.indexes.base import Index from pandas.core.reshape.concat import concat +from pandas.core.tools.numeric import to_numeric @Appender(_shared_docs['melt'] % @@ -25,6 +25,12 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): # TODO: what about the existing index? + # If multiindex, gather names of columns on all level for checking presence + # of `id_vars` and `value_vars` + if isinstance(frame.columns, ABCMultiIndex): + cols = [x for c in frame.columns for x in c] + else: + cols = list(frame.columns) if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] @@ -33,7 +39,13 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') else: + # Check that `id_vars` are in frame id_vars = list(id_vars) + missing = Index(np.ravel(id_vars)).difference(cols) + if not missing.empty: + raise KeyError("The following 'id_vars' are not present" + " in the DataFrame: {missing}" + "".format(missing=list(missing))) else: id_vars = [] @@ -46,6 +58,12 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, ' columns are a MultiIndex') else: value_vars = list(value_vars) + # Check that `value_vars` are in frame + missing = Index(np.ravel(value_vars)).difference(cols) + if not missing.empty: + raise KeyError("The following 'value_vars' are not present in" + " the DataFrame: {missing}" + "".format(missing=list(missing))) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() @@ -430,9 +448,8 @@ def melt_stub(df, stub, i, j, value_vars, sep): value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - melted = [] - for s, v in zip(stubnames, value_vars): - melted.append(melt_stub(df, s, i, j, v, sep)) + melted = [melt_stub(df, s, i, j, v, sep) + for s, v in zip(stubnames, value_vars)] melted = melted[0].join(melted[1:], how='outer') if len(i) == 1: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d0c7b66978661..c0c016f9a8caa 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -3,50 +3,34 @@ """ import copy -import warnings import string +import warnings import numpy as np -from pandas.compat import range, lzip, zip, map, filter -import pandas.compat as compat -from pandas import (Categorical, DataFrame, - Index, MultiIndex, Timedelta, Series) -from pandas.core.arrays.categorical import _recode_for_categories -from pandas.core.frame import _merge_doc -from pandas.core.dtypes.common import ( - is_datetime64tz_dtype, - is_datetime64_dtype, - needs_i8_conversion, - is_int64_dtype, - is_array_like, - is_categorical_dtype, - is_integer_dtype, - is_float_dtype, - is_number, - is_numeric_dtype, - is_integer, - is_int_or_datetime_dtype, - is_dtype_equal, - is_bool, - is_bool_dtype, - is_list_like, - is_datetimelike, - ensure_int64, - ensure_float64, - ensure_object, - _get_dtype) -from pandas.core.dtypes.missing import na_value_for_dtype -from pandas.core.internals import (items_overlap_with_suffix, - concatenate_block_managers) +from pandas._libs import hashtable as libhashtable, join as libjoin, lib +import pandas.compat as compat +from pandas.compat 
import filter, lzip, map, range, zip +from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution -from pandas.core.sorting import is_int64_overflow_possible +from pandas.core.dtypes.common import ( + ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool, + is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype, + is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, + is_numeric_dtype, needs_i8_conversion) +from pandas.core.dtypes.missing import isnull, na_value_for_dtype + +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta import pandas.core.algorithms as algos -import pandas.core.sorting as sorting +from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com -from pandas._libs import hashtable as libhashtable, join as libjoin, lib -from pandas.errors import MergeError +from pandas.core.frame import _merge_doc +from pandas.core.internals import ( + concatenate_block_managers, items_overlap_with_suffix) +import pandas.core.sorting as sorting +from pandas.core.sorting import is_int64_overflow_possible @Substitution('\nleft : DataFrame') @@ -215,11 +199,10 @@ def merge_ordered(left, right, on=None, The output type will the be same as 'left', if it is a subclass of DataFrame. - See also + See Also -------- merge merge_asof - """ def _merger(x, y): # perform the ordered merge operation @@ -328,7 +311,6 @@ def merge_asof(left, right, on=None, .. versionadded:: 0.20.0 - Returns ------- merged : DataFrame @@ -463,11 +445,10 @@ def merge_asof(left, right, on=None, 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN - See also + See Also -------- merge merge_ordered - """ op = _AsOfMerge(left, right, on=on, left_on=left_on, right_on=right_on, @@ -734,6 +715,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): result[name] = key_col elif result._is_level_reference(name): if isinstance(result.index, MultiIndex): + key_col.name = name idx_list = [result.index.get_level_values(level_name) if level_name != name else key_col for level_name in result.index.names] @@ -875,9 +857,9 @@ def _get_merge_keys(self): left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): - right_keys = [lev._values.take(lab) - for lev, lab in zip(self.right.index.levels, - self.right.index.labels)] + right_keys = [lev._values.take(lev_codes) for lev, lev_codes + in zip(self.right.index.levels, + self.right.index.codes)] else: right_keys = [self.right.index.values] elif _any(self.right_on): @@ -889,9 +871,9 @@ def _get_merge_keys(self): right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): - left_keys = [lev._values.take(lab) - for lev, lab in zip(self.left.index.levels, - self.left.index.labels)] + left_keys = [lev._values.take(lev_codes) for lev, lev_codes + in zip(self.left.index.levels, + self.left.index.codes)] else: left_keys = [self.left.index.values] @@ -1138,6 +1120,95 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', return join_func(lkey, rkey, count, **kwargs) +def _restore_dropped_levels_multijoin(left, right, dropped_level_names, + join_index, lindexer, rindexer): + """ + *this is an internal non-public method* + + Returns the levels, labels and names of a multi-index to multi-index join. 
+ Depending on the type of join, this method restores the appropriate + dropped levels of the joined multi-index. + The method relies on lidx, rindexer which hold the index positions of + left and right, where a join was feasible + + Parameters + ---------- + left : MultiIndex + left index + right : MultiIndex + right index + dropped_level_names : str array + list of non-common level names + join_index : MultiIndex + the index of the join between the + common levels of left and right + lindexer : intp array + left indexer + rindexer : intp array + right indexer + + Returns + ------- + levels : list of Index + levels of combined multiindexes + labels : intp array + labels of combined multiindexes + names : str array + names of combined multiindexes + + """ + + def _convert_to_mulitindex(index): + if isinstance(index, MultiIndex): + return index + else: + return MultiIndex.from_arrays([index.values], + names=[index.name]) + + # For multi-multi joins with one overlapping level, + # the returned index if of type Index + # Assure that join_index is of type MultiIndex + # so that dropped levels can be appended + join_index = _convert_to_mulitindex(join_index) + + join_levels = join_index.levels + join_codes = join_index.codes + join_names = join_index.names + + # lindexer and rindexer hold the indexes where the join occurred + # for left and right respectively. If left/right is None then + # the join occurred on all indices of left/right + if lindexer is None: + lindexer = range(left.size) + + if rindexer is None: + rindexer = range(right.size) + + # Iterate through the levels that must be restored + for dropped_level_name in dropped_level_names: + if dropped_level_name in left.names: + idx = left + indexer = lindexer + else: + idx = right + indexer = rindexer + + # The index of the level name to be restored + name_idx = idx.names.index(dropped_level_name) + + restore_levels = idx.levels[name_idx] + # Inject -1 in the codes list where a join was not possible + # IOW indexer[i]=-1 + codes = idx.codes[name_idx] + restore_codes = algos.take_nd(codes, indexer, fill_value=-1) + + join_levels = join_levels + [restore_levels] + join_codes = join_codes + [restore_codes] + join_names = join_names + [dropped_level_name] + + return join_levels, join_codes, join_names + + class _OrderedMerge(_MergeOperation): _merge_type = 'ordered_merge' @@ -1190,14 +1261,13 @@ def get_result(self): return result -def _asof_function(direction, on_type): - name = 'asof_join_{dir}_{on}'.format(dir=direction, on=on_type) +def _asof_function(direction): + name = 'asof_join_{dir}'.format(dir=direction) return getattr(libjoin, name, None) -def _asof_by_function(direction, on_type, by_type): - name = 'asof_join_{dir}_{on}_by_{by}'.format( - dir=direction, on=on_type, by=by_type) +def _asof_by_function(direction): + name = 'asof_join_{dir}_on_X_by_Y'.format(dir=direction) return getattr(libjoin, name, None) @@ -1207,29 +1277,6 @@ def _asof_by_function(direction, on_type, by_type): 'object': ensure_object, } -_cython_types = { - 'uint8': 'uint8_t', - 'uint32': 'uint32_t', - 'uint16': 'uint16_t', - 'uint64': 'uint64_t', - 'int8': 'int8_t', - 'int32': 'int32_t', - 'int16': 'int16_t', - 'int64': 'int64_t', - 'float16': 'error', - 'float32': 'float', - 'float64': 'double', -} - - -def _get_cython_type(dtype): - """ Given a dtype, return a C name like 'int64_t' or 'double' """ - type_name = _get_dtype(dtype).name - ctype = _cython_types.get(type_name, 'object') - if ctype == 'error': - raise MergeError('unsupported type: 
{type}'.format(type=type_name)) - return ctype - def _get_cython_type_upcast(dtype): """ Upcast a dtype to 'int64_t', 'double', or 'object' """ @@ -1390,12 +1437,21 @@ def flip(xs): self.right_join_keys[-1]) tolerance = self.tolerance - # we required sortedness in the join keys - msg = "{side} keys must be sorted" + # we require sortedness and non-null values in the join keys + msg_sorted = "{side} keys must be sorted" + msg_missings = "Merge keys contain null values on {side} side" + if not Index(left_values).is_monotonic: - raise ValueError(msg.format(side='left')) + if isnull(left_values).any(): + raise ValueError(msg_missings.format(side='left')) + else: + raise ValueError(msg_sorted.format(side='left')) + if not Index(right_values).is_monotonic: - raise ValueError(msg.format(side='right')) + if isnull(right_values).any(): + raise ValueError(msg_missings.format(side='right')) + else: + raise ValueError(msg_sorted.format(side='right')) # initial type conversion as needed if needs_i8_conversion(left_values): @@ -1429,8 +1485,7 @@ def flip(xs): right_by_values = by_type_caster(right_by_values) # choose appropriate function by type - on_type = _get_cython_type(left_values.dtype) - func = _asof_by_function(self.direction, on_type, by_type) + func = _asof_by_function(self.direction) return func(left_values, right_values, left_by_values, @@ -1439,8 +1494,7 @@ def flip(xs): tolerance) else: # choose appropriate function by type - on_type = _get_cython_type(left_values.dtype) - func = _asof_function(self.direction, on_type) + func = _asof_function(self.direction) return func(left_values, right_values, self.allow_exact_matches, @@ -1454,27 +1508,29 @@ def _get_multiindex_indexer(join_keys, index, sort): fkeys = partial(_factorize_keys, sort=sort) # left & right join labels and num. 
of levels at each location - rlab, llab, shape = map(list, zip(* map(fkeys, index.levels, join_keys))) + rcodes, lcodes, shape = map(list, zip(* map(fkeys, + index.levels, + join_keys))) if sort: - rlab = list(map(np.take, rlab, index.labels)) + rcodes = list(map(np.take, rcodes, index.codes)) else: i8copy = lambda a: a.astype('i8', subok=False, copy=True) - rlab = list(map(i8copy, index.labels)) + rcodes = list(map(i8copy, index.codes)) # fix right labels if there were any nulls for i in range(len(join_keys)): - mask = index.labels[i] == -1 + mask = index.codes[i] == -1 if mask.any(): # check if there already was any nulls at this location # if there was, it is factorized to `shape[i] - 1` - a = join_keys[i][llab[i] == shape[i] - 1] + a = join_keys[i][lcodes[i] == shape[i] - 1] if a.size == 0 or not a[0] != a[0]: shape[i] += 1 - rlab[i][mask] = shape[i] - 1 + rcodes[i][mask] = shape[i] - 1 # get flat i8 join keys - lkey, rkey = _get_join_keys(llab, rlab, shape, sort) + lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort) # factorize keys to a dense i8 space lkey, rkey, count = fkeys(lkey, rkey) @@ -1551,7 +1607,15 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): + elif is_integer_dtype(lk) and is_integer_dtype(rk): + # GH#23917 TODO: needs tests for case where lk is integer-dtype + # and rk is datetime-dtype + klass = libhashtable.Int64Factorizer + lk = ensure_int64(com.values_from_object(lk)) + rk = ensure_int64(com.values_from_object(rk)) + elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and + issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))): + # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer lk = ensure_int64(com.values_from_object(lk)) rk = ensure_int64(com.values_from_object(rk)) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b525dddeb1ba5..84faab017163f 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,28 +1,25 @@ # pylint: disable=E1103 +import numpy as np +from pandas.compat import lrange, range, zip +from pandas.util._decorators import Appender, Substitution -from pandas.core.dtypes.common import ( - is_list_like, is_scalar, is_integer_dtype) -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.reshape.concat import concat -from pandas.core.series import Series -from pandas.core.groupby import Grouper -from pandas.core.reshape.util import cartesian_product -from pandas.core.index import Index, MultiIndex, _get_objs_combined_axis -from pandas.compat import range, lrange, zip from pandas import compat import pandas.core.common as com -from pandas.util._decorators import Appender, Substitution - from pandas.core.frame import _shared_docs -# Note: We need to make sure `frame` is imported before `pivot`, otherwise -# _shared_docs['pivot_table'] will not yet exist. 
TODO: Fix this dependency - -import numpy as np +from pandas.core.groupby import Grouper +from pandas.core.index import Index, MultiIndex, _get_objs_combined_axis +from pandas.core.reshape.concat import concat +from pandas.core.reshape.util import cartesian_product +from pandas.core.series import Series +# Note: We need to make sure `frame` is imported before `pivot`, otherwise +# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency @Substitution('\ndata : DataFrame') @Appender(_shared_docs['pivot_table'], indents=1) def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', @@ -140,8 +137,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', margins_name=margins_name, fill_value=fill_value) # discard the top level - if values_passed and not values_multi and not table.empty and \ - (table.columns.nlevels > 1): + if (values_passed and not values_multi and not table.empty and + (table.columns.nlevels > 1)): table = table[values[0]] if len(index) == 0 and len(columns) > 0: @@ -410,12 +407,12 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, values : array-like, optional Array of values to aggregate according to the factors. Requires `aggfunc` be specified. - aggfunc : function, optional - If specified, requires `values` be specified as well rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None If passed, must match number of column arrays passed + aggfunc : function, optional + If specified, requires `values` be specified as well margins : boolean, default False Add row/column margins (subtotals) margins_name : string, default 'All' @@ -436,7 +433,6 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, .. 
versionadded:: 0.18.1 - Notes ----- Any Series passed will have their name attributes used unless row or column diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 50f6e310705d7..ba86d3d9ba25f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,35 +1,31 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 -from pandas.compat import range, text_type, zip, u, PY2 -from pandas import compat from functools import partial import itertools import numpy as np -from pandas.core.dtypes.common import ( - ensure_platform_int, - is_list_like, is_bool_dtype, - needs_i8_conversion, is_sparse, is_object_dtype) -from pandas.core.dtypes.cast import maybe_promote -from pandas.core.dtypes.missing import notna - -from pandas.core.series import Series -from pandas.core.frame import DataFrame - -from pandas.core.sparse.api import SparseDataFrame, SparseSeries -from pandas.core.sparse.array import SparseArray +from pandas._libs import algos as _algos, reshape as _reshape from pandas._libs.sparse import IntIndex +from pandas.compat import PY2, range, text_type, u, zip -from pandas.core.arrays import Categorical -from pandas.core.arrays.categorical import _factorize_from_iterable -from pandas.core.sorting import (get_group_index, get_compressed_ids, - compress_group_index, decons_obs_group_ids) +from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import ( + ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like, + is_object_dtype, needs_i8_conversion) +from pandas.core.dtypes.missing import notna +from pandas import compat import pandas.core.algorithms as algos -from pandas._libs import algos as _algos, reshape as _reshape - +from pandas.core.arrays import SparseArray +from pandas.core.arrays.categorical import _factorize_from_iterable +from pandas.core.frame import DataFrame from pandas.core.index import Index, MultiIndex +from pandas.core.series import Series +from pandas.core.sorting import ( + compress_group_index, decons_obs_group_ids, get_compressed_ids, + get_group_index) +from pandas.core.sparse.api import SparseDataFrame, SparseSeries class _Unstacker(object): @@ -86,28 +82,15 @@ class _Unstacker(object): def __init__(self, values, index, level=-1, value_columns=None, fill_value=None, constructor=None): - self.is_categorical = None - self.is_sparse = is_sparse(values) if values.ndim == 1: - if isinstance(values, Categorical): - self.is_categorical = values - values = np.array(values) - elif self.is_sparse: - # XXX: Makes SparseArray *dense*, but it's supposedly - # a single column at a time, so it's "doable" - values = values.values values = values[:, np.newaxis] self.values = values self.value_columns = value_columns self.fill_value = fill_value if constructor is None: - if self.is_sparse: - self.constructor = SparseDataFrame - else: - self.constructor = DataFrame - else: - self.constructor = constructor + constructor = DataFrame + self.constructor = constructor if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError('must pass column labels for multi-column data') @@ -117,7 +100,7 @@ def __init__(self, values, index, level=-1, value_columns=None, self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 - self.lift = 1 if -1 in self.index.labels[self.level] else 0 + self.lift = 1 if -1 in self.index.codes[self.level] else 0 self.new_index_levels = list(self.index.levels) 
self.new_index_names = list(self.index.names) @@ -132,9 +115,9 @@ def __init__(self, values, index, level=-1, value_columns=None, def _make_sorted_values_labels(self): v = self.level - labs = list(self.index.labels) + codes = list(self.index.codes) levs = list(self.index.levels) - to_sort = labs[:v] + labs[v + 1:] + [labs[v]] + to_sort = codes[:v] + codes[v + 1:] + [codes[v]] sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] comp_index, obs_ids = get_compressed_ids(to_sort, sizes) @@ -178,14 +161,6 @@ def get_result(self): columns = self.get_new_columns() index = self.get_new_index() - # may need to coerce categoricals here - if self.is_categorical is not None: - categories = self.is_categorical.categories - ordered = self.is_categorical.ordered - values = [Categorical(values[:, i], categories=categories, - ordered=ordered) - for i in range(values.shape[-1])] - return self.constructor(values, index=index, columns=columns) def get_new_values(self): @@ -268,16 +243,16 @@ def get_new_columns(self): new_levels = self.value_columns.levels + (self.removed_level_full,) new_names = self.value_columns.names + (self.removed_name,) - new_labels = [lab.take(propagator) - for lab in self.value_columns.labels] + new_codes = [lab.take(propagator) + for lab in self.value_columns.codes] else: new_levels = [self.value_columns, self.removed_level_full] new_names = [self.value_columns.name, self.removed_name] - new_labels = [propagator] + new_codes = [propagator] # The two indices differ only if the unstacked level had unused items: if len(self.removed_level_full) != len(self.removed_level): - # In this case, we remap the new labels to the original level: + # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: repeater = np.insert(repeater, 0, -1) @@ -286,22 +261,22 @@ def get_new_columns(self): repeater = np.arange(stride) - self.lift # The entire level is then just a repetition of the single chunk: - new_labels.append(np.tile(repeater, width)) - return MultiIndex(levels=new_levels, labels=new_labels, + new_codes.append(np.tile(repeater, width)) + return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) def get_new_index(self): - result_labels = [lab.take(self.compressor) - for lab in self.sorted_labels[:-1]] + result_codes = [lab.take(self.compressor) + for lab in self.sorted_labels[:-1]] # construct the new index if len(self.new_index_levels) == 1: - lev, lab = self.new_index_levels[0], result_labels[0] + lev, lab = self.new_index_levels[0], result_codes[0] if (lab == -1).any(): lev = lev.insert(len(lev), lev._na_value) return lev.take(lab) - return MultiIndex(levels=self.new_index_levels, labels=result_labels, + return MultiIndex(levels=self.new_index_levels, codes=result_codes, names=self.new_index_names, verify_integrity=False) @@ -318,35 +293,36 @@ def _unstack_multiple(data, clocs, fill_value=None): rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] - clabels = [index.labels[i] for i in clocs] + ccodes = [index.codes[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] - rlabels = [index.labels[i] for i in rlocs] + rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] - group_index = get_group_index(clabels, shape, sort=False, xnull=False) + group_index = get_group_index(ccodes, shape, sort=False, 
xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) - recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels, - xnull=False) + recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, + xnull=False) if rlocs == []: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name='__placeholder__') else: dummy_index = MultiIndex(levels=rlevels + [obs_ids], - labels=rlabels + [comp_ids], + codes=rcodes + [comp_ids], names=rnames + ['__placeholder__'], verify_integrity=False) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index + unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) new_levels = clevels new_names = cnames - new_labels = recons_labels + new_codes = recons_codes else: if isinstance(data.columns, MultiIndex): result = data @@ -368,11 +344,11 @@ def _unstack_multiple(data, clocs, fill_value=None): new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames - new_labels = [unstcols.labels[0]] - for rec in recons_labels: - new_labels.append(rec.take(unstcols.labels[-1])) + new_codes = [unstcols.codes[0]] + for rec in recons_codes: + new_codes.append(rec.take(unstcols.codes[-1])) - new_columns = MultiIndex(levels=new_levels, labels=new_labels, + new_columns = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) if isinstance(unstacked, Series): @@ -398,6 +374,8 @@ def unstack(obj, level, fill_value=None): else: return obj.T.stack(dropna=False) else: + if is_extension_array_dtype(obj.dtype): + return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker(obj.values, obj.index, level=level, fill_value=fill_value, constructor=obj._constructor_expanddim) @@ -408,7 +386,8 @@ def _unstack_frame(obj, level, fill_value=None): if obj._is_mixed_type: unstacker = partial(_Unstacker, index=obj.index, level=level, fill_value=fill_value) - blocks = obj._data.unstack(unstacker) + blocks = obj._data.unstack(unstacker, + fill_value=fill_value) return obj._constructor(blocks) else: unstacker = _Unstacker(obj.values, obj.index, level=level, @@ -418,6 +397,52 @@ def _unstack_frame(obj, level, fill_value=None): return unstacker.get_result() +def _unstack_extension_series(series, level, fill_value): + """ + Unstack an ExtensionArray-backed Series. + + The ExtensionDtype is preserved. + + Parameters + ---------- + series : Series + A Series with an ExtensionArray for values + level : Any + The level name or number. + fill_value : Any + The user-level (not physical storage) fill value to use for + missing values introduced by the reshape. Passed to + ``series.values.take``. + + Returns + ------- + DataFrame + Each column of the DataFrame will have the same dtype as + the input Series. + """ + # Implementation note: the basic idea is to + # 1. Do a regular unstack on a dummy array of integers + # 2. Followup with a columnwise take. + # We use the dummy take to discover newly-created missing values + # introduced by the reshape. 
+ from pandas.core.reshape.concat import concat + + dummy_arr = np.arange(len(series)) + # fill_value=-1, since we will do a series.values.take later + result = _Unstacker(dummy_arr, series.index, + level=level, fill_value=-1).get_result() + + out = [] + values = series.values + + for col, indices in result.iteritems(): + out.append(Series(values.take(indices.values, + allow_fill=True, + fill_value=fill_value), + name=col, index=result.index)) + return concat(out, axis='columns', copy=False, keys=result.columns) + + def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the @@ -427,7 +452,6 @@ def stack(frame, level=-1, dropna=True): ------- stacked : Series """ - def factorize(index): if index.is_unique: return index, np.arange(len(index)) @@ -443,25 +467,44 @@ def factorize(index): return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) - new_labels = [lab.repeat(K) for lab in frame.index.labels] + new_codes = [lab.repeat(K) for lab in frame.index.codes] clev, clab = factorize(frame.columns) new_levels.append(clev) - new_labels.append(np.tile(clab, N).ravel()) + new_codes.append(np.tile(clab, N).ravel()) new_names = list(frame.index.names) new_names.append(frame.columns.name) - new_index = MultiIndex(levels=new_levels, labels=new_labels, + new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) else: levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) - labels = ilab.repeat(K), np.tile(clab, N).ravel() - new_index = MultiIndex(levels=levels, labels=labels, + codes = ilab.repeat(K), np.tile(clab, N).ravel() + new_index = MultiIndex(levels=levels, codes=codes, names=[frame.index.name, frame.columns.name], verify_integrity=False) - new_values = frame.values.ravel() + if frame._is_homogeneous_type: + # For homogeneous EAs, frame.values will coerce to object. So + # we concatenate instead. 
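The comment immediately above carries the key insight for the extension-array path in stack(): frame.values on an all-EA frame falls back to object, so the implementation concatenates the column arrays and then reorders them. A rough illustration with made-up data (``_concat_same_type`` is the same private EA method the patch itself relies on):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': pd.Categorical(['x', 'y']),
                       'b': pd.Categorical(['z', 'w'])})

    df.values.dtype              # object -- the categorical dtype is lost

    # concatenating the column arrays keeps the extension dtype, but in
    # column-major order: ['x', 'y', 'z', 'w']
    combined = pd.Categorical._concat_same_type(
        [df['a'].values, df['b'].values])

    # a final take restores the row-major order that stack() produces:
    # ['x', 'z', 'y', 'w']
    n_rows, n_cols = df.shape
    order = np.arange(n_rows * n_cols).reshape(n_cols, n_rows).T.ravel()
    row_major = combined.take(order)

This is the same reordering that _reorder_for_extension_array_stack, added further down in this diff, performs.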
+ dtypes = list(frame.dtypes.values) + dtype = dtypes[0] + + if is_extension_array_dtype(dtype): + arr = dtype.construct_array_type() + new_values = arr._concat_same_type([ + col._values for _, col in frame.iteritems() + ]) + new_values = _reorder_for_extension_array_stack(new_values, N, K) + else: + # homogeneous, non-EA + new_values = frame.values.ravel() + + else: + # non-homogeneous + new_values = frame.values.ravel() + if dropna: mask = notna(new_values) new_values = new_values[mask] @@ -549,9 +592,9 @@ def _convert_level_number(level_num, columns): # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: - tuples = list(zip(*[lev.take(lab) - for lev, lab in zip(this.columns.levels[:-1], - this.columns.labels[:-1])])) + tuples = list(zip(*[lev.take(level_codes) for lev, level_codes + in zip(this.columns.levels[:-1], + this.columns.codes[:-1])])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) @@ -561,9 +604,9 @@ def _convert_level_number(level_num, columns): # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] - level_labels = sorted(set(this.columns.labels[-1])) - level_vals_used = level_vals[level_labels] - levsize = len(level_labels) + level_codes = sorted(set(this.columns.codes[-1])) + level_vals_used = level_vals[level_codes] + levsize = len(level_codes) drop_cols = [] for key in unique_groups: try: @@ -583,15 +626,31 @@ def _convert_level_number(level_num, columns): if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] - chunk.columns = level_vals.take(chunk.columns.labels[-1]) + chunk.columns = level_vals.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: - if frame._is_mixed_type: - value_slice = this.loc[:, this.columns[loc]].values + if (frame._is_homogeneous_type and + is_extension_array_dtype(frame.dtypes.iloc[0])): + dtype = this[this.columns[loc]].dtypes.iloc[0] + subset = this[this.columns[loc]] + + value_slice = dtype.construct_array_type()._concat_same_type( + [x._values for _, x in subset.iteritems()] + ) + N, K = this.shape + idx = np.arange(N * K).reshape(K, N).T.ravel() + value_slice = value_slice.take(idx) + + elif frame._is_mixed_type: + value_slice = this[this.columns[loc]].values else: value_slice = this.values[:, loc] - new_data[key] = value_slice.ravel() + if value_slice.ndim > 1: + # i.e. not extension + value_slice = value_slice.ravel() + + new_data[key] = value_slice if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) @@ -601,17 +660,17 @@ def _convert_level_number(level_num, columns): if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) - new_labels = [lab.repeat(levsize) for lab in this.index.labels] + new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: new_levels = [this.index] - new_labels = [np.arange(N).repeat(levsize)] + new_codes = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? 
new_levels.append(level_vals) - new_labels.append(np.tile(level_labels, N)) + new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) - new_index = MultiIndex(levels=new_levels, labels=new_labels, + new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) result = frame._constructor(new_data, index=new_index, columns=new_columns) @@ -745,9 +804,8 @@ def check_len(item, name): if is_list_like(item): if not len(item) == data_to_encode.shape[1]: - len_msg = \ - len_msg.format(name=name, len_item=len(item), - len_enc=data_to_encode.shape[1]) + len_msg = len_msg.format(name=name, len_item=len(item), + len_enc=data_to_encode.shape[1]) raise ValueError(len_msg) check_len(prefix, 'prefix') @@ -921,12 +979,47 @@ def make_axis_dummies(frame, axis='minor', transform=None): num = numbers.get(axis, axis) items = frame.index.levels[num] - labels = frame.index.labels[num] + codes = frame.index.codes[num] if transform is not None: mapped_items = items.map(transform) - labels, items = _factorize_from_iterable(mapped_items.take(labels)) + codes, items = _factorize_from_iterable(mapped_items.take(codes)) values = np.eye(len(items), dtype=float) - values = values.take(labels, axis=0) + values = values.take(codes, axis=0) return DataFrame(values, columns=items, index=frame.index) + + +def _reorder_for_extension_array_stack(arr, n_rows, n_columns): + """ + Re-orders the values when stacking multiple extension-arrays. + + The indirect stacking method used for EAs requires a followup + take to get the order correct. + + Parameters + ---------- + arr : ExtensionArray + n_rows, n_columns : int + The number of rows and columns in the original DataFrame. + + Returns + ------- + taken : ExtensionArray + The original `arr` with elements re-ordered appropriately + + Examples + -------- + >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) + >>> _reorder_for_extension_array_stack(arr, 2, 3) + array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='>> _reorder_for_extension_array_stack(arr, 3, 2) + array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='>> pd.Series([1, 2, 3]).values @@ -460,28 +466,33 @@ def values(self): array(['2013-01-01T05:00:00.000000000', '2013-01-02T05:00:00.000000000', '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]') - """ return self._data.external_values() @property def _values(self): - """ return the internal repr of this data """ + """ + Return the internal repr of this data. + """ return self._data.internal_values() def _formatting_values(self): - """Return the values that can be formatted (used by SeriesFormatter - and DataFrameFormatter) + """ + Return the values that can be formatted (used by SeriesFormatter + and DataFrameFormatter). """ return self._data.formatting_values() def get_values(self): - """ same as values (but handles sparseness conversions); is a view """ + """ + Same as values (but handles sparseness conversions); is a view. + """ return self._data.get_values() @property def asobject(self): - """Return object Series which contains boxed values. + """ + Return object Series which contains boxed values. .. deprecated :: 0.23.0 @@ -496,9 +507,9 @@ def asobject(self): # ops def ravel(self, order='C'): """ - Return the flattened underlying data as an ndarray + Return the flattened underlying data as an ndarray. 
- See also + See Also -------- numpy.ndarray.ravel """ @@ -506,11 +517,11 @@ def ravel(self, order='C'): def compress(self, condition, *args, **kwargs): """ - Return selected slices of an array along given axis as a Series + Return selected slices of an array along given axis as a Series. .. deprecated:: 0.24.0 - See also + See Also -------- numpy.ndarray.compress """ @@ -523,7 +534,7 @@ def compress(self, condition, *args, **kwargs): def nonzero(self): """ - Return the *integer* indices of the elements that are non-zero + Return the *integer* indices of the elements that are non-zero. This method is equivalent to calling `numpy.nonzero` on the series data. For compatibility with NumPy, the return value is @@ -558,10 +569,9 @@ def nonzero(self): def put(self, *args, **kwargs): """ - Applies the `put` method to its `values` attribute - if it has one. + Applies the `put` method to its `values` attribute if it has one. - See also + See Also -------- numpy.ndarray.put """ @@ -569,7 +579,7 @@ def put(self, *args, **kwargs): def __len__(self): """ - return the length of the Series + Return the length of the Series. """ return len(self._data) @@ -642,26 +652,31 @@ def view(self, dtype=None): return self._constructor(self._values.view(dtype), index=self.index).__finalize__(self) + # ---------------------------------------------------------------------- + # NDArray Compat + def __array__(self, result=None): """ - the array interface, return my values + The array interface, return my values. """ return self.get_values() def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc + Gets called after a ufunc. """ return self._constructor(result, index=self.index, copy=False).__finalize__(self) def __array_prepare__(self, result, context=None): """ - Gets called prior to a ufunc + Gets called prior to a ufunc. """ # nice error message for non-ufunc types - if context is not None and not isinstance(self._values, np.ndarray): + if (context is not None and + (not isinstance(self._values, (np.ndarray, ExtensionArray)) + or isinstance(self._values, Categorical))): obj = context[1][0] raise TypeError("{obj} with dtype {dtype} cannot perform " "the numpy op {op}".format( @@ -670,9 +685,14 @@ def __array_prepare__(self, result, context=None): op=context[0].__name__)) return result - # complex + # ---------------------------------------------------------------------- + # Unary Methods + @property def real(self): + """ + Return the real value of vector. + """ return self.values.real @real.setter @@ -681,6 +701,9 @@ def real(self, v): @property def imag(self): + """ + Return imag value of vector. + """ return self.values.imag @imag.setter @@ -692,6 +715,8 @@ def imag(self, v): __long__ = _coerce_method(int) __int__ = _coerce_method(int) + # ---------------------------------------------------------------------- + def _unpickle_series_compat(self, state): if isinstance(state, dict): self._data = state['_data'] @@ -724,12 +749,14 @@ def _unpickle_series_compat(self, state): # indexers @property def axes(self): - """Return a list of the row axis labels""" + """ + Return a list of the row axis labels. + """ return [self.index] def _ixs(self, i, axis=0): """ - Return the i-th value or values in the Series by location + Return the i-th value or values in the Series by location. 
Parameters ---------- @@ -968,7 +995,9 @@ def _set_with(self, key, value): except Exception: pass - if not isinstance(key, (list, Series, np.ndarray, Series)): + if is_scalar(key): + key = [key] + elif not isinstance(key, (list, Series, np.ndarray)): try: key = list(key) except Exception: @@ -1011,7 +1040,7 @@ def repeat(self, repeats, *args, **kwargs): Repeat elements of an Series. Refer to `numpy.ndarray.repeat` for more information about the `repeats` argument. - See also + See Also -------- numpy.ndarray.repeat """ @@ -1022,7 +1051,8 @@ def repeat(self, repeats, *args, **kwargs): index=new_index).__finalize__(self) def get_value(self, label, takeable=False): - """Quickly retrieve single value at passed index label + """ + Quickly retrieve single value at passed index label. .. deprecated:: 0.21.0 Please use .at[] or .iat[] accessors. @@ -1049,9 +1079,11 @@ def _get_value(self, label, takeable=False): _get_value.__doc__ = get_value.__doc__ def set_value(self, label, value, takeable=False): - """Quickly set single value at passed label. If label is not contained, - a new object is created with the label placed at the end of the result - index. + """ + Quickly set single value at passed label. + + If label is not contained, a new object is created with the label + placed at the end of the result index. .. deprecated:: 0.21.0 Please use .at[] or .iat[] accessors. @@ -1222,9 +1254,12 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): df = self.to_frame(name) return df.reset_index(level=level, drop=drop) + # ---------------------------------------------------------------------- + # Rendering Methods + def __unicode__(self): """ - Return a string representation for a particular DataFrame + Return a string representation for a particular DataFrame. Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. @@ -1245,7 +1280,7 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, index=True, length=False, dtype=False, name=False, max_rows=None): """ - Render a string representation of the Series + Render a string representation of the Series. Parameters ---------- @@ -1256,7 +1291,7 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, float_format : one-parameter function, optional formatter function to apply to columns' elements if they are floats default None - header: boolean, default True + header : boolean, default True Add the Series header (index name) index : bool, optional Add index (row) labels, default True @@ -1297,9 +1332,11 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, with open(buf, 'w') as f: f.write(result) + # ---------------------------------------------------------------------- + def iteritems(self): """ - Lazily iterate over (index, value) tuples + Lazily iterate over (index, value) tuples. """ return zip(iter(self.index), iter(self)) @@ -1309,7 +1346,9 @@ def iteritems(self): # Misc public methods def keys(self): - """Alias for index""" + """ + Alias for index. + """ return self.index def to_dict(self, into=dict): @@ -1348,7 +1387,7 @@ def to_dict(self, into=dict): def to_frame(self, name=None): """ - Convert Series to DataFrame + Convert Series to DataFrame. Parameters ---------- @@ -1369,7 +1408,7 @@ def to_frame(self, name=None): def to_sparse(self, kind='block', fill_value=None): """ - Convert Series to SparseSeries + Convert Series to SparseSeries. 
Parameters ---------- @@ -1380,9 +1419,13 @@ def to_sparse(self, kind='block', fill_value=None): ------- sp : SparseSeries """ + # TODO: deprecate from pandas.core.sparse.series import SparseSeries - return SparseSeries(self, kind=kind, - fill_value=fill_value).__finalize__(self) + + values = SparseArray(self, kind=kind, fill_value=fill_value) + return SparseSeries( + values, index=self.index, name=self.name + ).__finalize__(self) def _set_name(self, name, inplace=False): """ @@ -1406,7 +1449,7 @@ def _set_name(self, name, inplace=False): def count(self, level=None): """ - Return number of non-NA/null observations in the Series + Return number of non-NA/null observations in the Series. Parameters ---------- @@ -1425,20 +1468,21 @@ def count(self, level=None): level = self.index._get_level_number(level) lev = self.index.levels[level] - lab = np.array(self.index.labels[level], subok=False, copy=True) + level_codes = np.array(self.index.codes[level], subok=False, copy=True) - mask = lab == -1 + mask = level_codes == -1 if mask.any(): - lab[mask] = cnt = len(lev) + level_codes[mask] = cnt = len(lev) lev = lev.insert(cnt, lev._na_value) - obs = lab[notna(self.values)] + obs = level_codes[notna(self.values)] out = np.bincount(obs, minlength=len(lev) or None) return self._constructor(out, index=lev, dtype='int64').__finalize__(self) def mode(self, dropna=True): - """Return the mode(s) of the dataset. + """ + Return the mode(s) of the dataset. Always returns Series even if only one value is returned. @@ -1471,8 +1515,8 @@ def unique(self): See Also -------- - pandas.unique : top-level unique method for any 1-d array-like object. - Index.unique : return Index with unique values from an Index object. + unique : Top-level unique method for any 1-d array-like object. + Index.unique : Return Index with unique values from an Index object. Examples -------- @@ -1531,9 +1575,9 @@ def drop_duplicates(self, keep='first', inplace=False): See Also -------- - Index.drop_duplicates : equivalent method on Index - DataFrame.drop_duplicates : equivalent method on DataFrame - Series.duplicated : related method on Series, indicating duplicate + Index.drop_duplicates : Equivalent method on Index. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Series.duplicated : Related method on Series, indicating duplicate Series values. Examples @@ -1653,9 +1697,9 @@ def duplicated(self, keep='first'): See Also -------- - pandas.Index.duplicated : Equivalent method on pandas.Index - pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame - pandas.Series.drop_duplicates : Remove duplicate values from Series + Index.duplicated : Equivalent method on pandas.Index. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Series.drop_duplicates : Remove duplicate values from Series. """ return super(Series, self).duplicated(keep=keep) @@ -1801,7 +1845,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): # ndarray compat argmin = deprecate( 'argmin', idxmin, '0.21.0', - msg=dedent("""\ + msg=dedent(""" The current behaviour of 'Series.argmin' is deprecated, use 'idxmin' instead. The behavior of 'argmin' will be corrected to return the positional @@ -1811,7 +1855,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): ) argmax = deprecate( 'argmax', idxmax, '0.21.0', - msg=dedent("""\ + msg=dedent(""" The current behaviour of 'Series.argmax' is deprecated, use 'idxmax' instead. 
The behavior of 'argmax' will be corrected to return the positional @@ -1839,7 +1883,6 @@ def round(self, decimals=0, *args, **kwargs): -------- numpy.around DataFrame.round - """ nv.validate_round(args, kwargs) result = com.values_from_object(self).round(decimals) @@ -1887,7 +1930,7 @@ def quantile(self, q=0.5, interpolation='linear'): See Also -------- - pandas.core.window.Rolling.quantile + core.window.Rolling.quantile numpy.percentile """ @@ -1905,7 +1948,7 @@ def quantile(self, q=0.5, interpolation='linear'): def corr(self, other, method='pearson', min_periods=None): """ - Compute correlation with `other` Series, excluding missing values + Compute correlation with `other` Series, excluding missing values. Parameters ---------- @@ -1921,14 +1964,12 @@ def corr(self, other, method='pearson', min_periods=None): min_periods : int, optional Minimum number of observations needed to have a valid result - Returns ------- correlation : float Examples -------- - >>> import numpy as np >>> histogram_intersection = lambda a, b: np.minimum(a, b ... ).sum().round(decimals=1) >>> s1 = pd.Series([.2, .0, .6, .2]) @@ -1950,7 +1991,7 @@ def corr(self, other, method='pearson', min_periods=None): def cov(self, other, min_periods=None): """ - Compute covariance with Series, excluding missing values + Compute covariance with Series, excluding missing values. Parameters ---------- @@ -1992,7 +2033,7 @@ def diff(self, periods=1): Series.pct_change: Percent change over given number of periods. Series.shift: Shift index by desired number of periods with an optional time freq. - DataFrame.diff: First discrete difference of object + DataFrame.diff: First discrete difference of object. Examples -------- @@ -2080,16 +2121,53 @@ def autocorr(self, lag=1): def dot(self, other): """ - Matrix multiplication with DataFrame or inner-product with Series - objects. Can also be called using `self @ other` in Python >= 3.5. + Compute the dot product between the Series and the columns of other. + + This method computes the dot product between the Series and another + one, or the Series and each columns of a DataFrame, or the Series and + each columns of an array. + + It can also be called using `self @ other` in Python >= 3.5. Parameters ---------- - other : Series or DataFrame + other : Series, DataFrame or array-like + The other object to compute the dot product with its columns. Returns ------- - dot_product : scalar or Series + scalar, Series or numpy.ndarray + Return the dot product of the Series and other if other is a + Series, the Series of the dot product of Series and each rows of + other if other is a DataFrame or a numpy.ndarray between the Series + and each columns of the numpy array. + + See Also + -------- + DataFrame.dot: Compute the matrix product with the DataFrame. + Series.mul: Multiplication of series and other, element-wise. + + Notes + ----- + The Series and other has to share the same index if other is a Series + or a DataFrame. 
+ + Examples + -------- + >>> s = pd.Series([0, 1, 2, 3]) + >>> other = pd.Series([-1, 2, -3, 4]) + >>> s.dot(other) + 8 + >>> s @ other + 8 + >>> df = pd.DataFrame([[0 ,1], [-2, 3], [4, -5], [6, 7]]) + >>> s.dot(df) + 0 24 + 1 14 + dtype: int64 + >>> arr = np.array([[0, 1], [-2, 3], [4, -5], [6, 7]]) + >>> s.dot(arr) + array([24, 14]) """ from pandas.core.frame import DataFrame if isinstance(other, (Series, DataFrame)): @@ -2120,11 +2198,15 @@ def dot(self, other): raise TypeError('unsupported type: %s' % type(other)) def __matmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ return self.dot(other) def __rmatmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ return self.dot(np.transpose(other)) @Substitution(klass='Series') @@ -2160,10 +2242,10 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): list and then concatenate the list with the original Series all at once. - See also + See Also -------- - pandas.concat : General function to concatenate DataFrame, Series - or Panel objects + concat : General function to concatenate DataFrame, Series + or Panel objects. Returns ------- @@ -2209,8 +2291,6 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): Traceback (most recent call last): ... ValueError: Indexes have overlapping values: [0, 1, 2] - - """ from pandas.core.reshape.concat import concat @@ -2223,7 +2303,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): def _binop(self, other, func, level=None, fill_value=None): """ - Perform generic binary operation with optional fill value + Perform generic binary operation with optional fill value. Parameters ---------- @@ -2266,36 +2346,70 @@ def _binop(self, other, func, level=None, fill_value=None): def combine(self, other, func, fill_value=None): """ - Perform elementwise binary operation on two Series using given function - with optional fill value when an index is missing from one Series or - the other + Combine the Series with a Series or scalar according to `func`. + + Combine the Series and `other` using `func` to perform elementwise + selection for combined Series. + `fill_value` is assumed when value is missing at some index + from one of the two objects being combined. Parameters ---------- - other : Series or scalar value + other : Series or scalar + The value(s) to be combined with the `Series`. func : function - Function that takes two scalars as inputs and return a scalar - fill_value : scalar value - The default specifies to use the appropriate NaN value for - the underlying dtype of the Series + Function that takes two scalars as inputs and returns an element. + fill_value : scalar, optional + The value to assume when an index is missing from + one Series or the other. The default specifies to use the + appropriate NaN value for the underlying dtype of the Series. Returns ------- - result : Series - - Examples - -------- - >>> s1 = pd.Series([1, 2]) - >>> s2 = pd.Series([0, 3]) - >>> s1.combine(s2, lambda x1, x2: x1 if x1 < x2 else x2) - 0 0 - 1 2 - dtype: int64 + Series + The result of combining the Series with the other object. See Also -------- Series.combine_first : Combine Series values, choosing the calling - Series's values first + Series' values first. 
+ + Examples + -------- + Consider 2 Datasets ``s1`` and ``s2`` containing + highest clocked speeds of different birds. + + >>> s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0}) + >>> s1 + falcon 330.0 + eagle 160.0 + dtype: float64 + >>> s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + >>> s2 + falcon 345.0 + eagle 200.0 + duck 30.0 + dtype: float64 + + Now, to combine the two datasets and view the highest speeds + of the birds across the two datasets + + >>> s1.combine(s2, max) + duck NaN + eagle 200.0 + falcon 345.0 + dtype: float64 + + In the previous example, the resulting value for duck is missing, + because the maximum of a NaN and a float is a NaN. + So, in the example, we set ``fill_value=0``, + so the maximum value returned will be the value from some dataset. + + >>> s1.combine(s2, max, fill_value=0) + duck 30.0 + eagle 200.0 + falcon 345.0 + dtype: float64 """ if fill_value is None: fill_value = na_value_for_dtype(self.dtype, compat=False) @@ -2337,16 +2451,26 @@ def combine(self, other, func, fill_value=None): def combine_first(self, other): """ - Combine Series values, choosing the calling Series's values - first. Result index will be the union of the two indexes + Combine Series values, choosing the calling Series's values first. Parameters ---------- other : Series + The value(s) to be combined with the `Series`. Returns ------- - combined : Series + Series + The result of combining the Series with the other object. + + See Also + -------- + Series.combine : Perform elementwise operation on two Series + using a given function. + + Notes + ----- + Result index will be the union of the two indexes. Examples -------- @@ -2356,11 +2480,6 @@ def combine_first(self, other): 0 1.0 1 4.0 dtype: float64 - - See Also - -------- - Series.combine : Perform elementwise operation on two Series - using a given function """ new_index = self.index.union(other.index) this = self.reindex(new_index, copy=False) @@ -2373,7 +2492,7 @@ def combine_first(self, other): def update(self, other): """ Modify Series in place using non-NA values from passed - Series. Aligns on index + Series. Aligns on index. Parameters ---------- @@ -2415,7 +2534,6 @@ def update(self, other): 1 2 2 6 dtype: int64 - """ other = other.reindex_like(self) mask = notna(other) @@ -2628,9 +2746,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, See Also -------- - DataFrame.sort_index: Sort DataFrame by the index - DataFrame.sort_values: Sort DataFrame by the value - Series.sort_values : Sort Series by the value + DataFrame.sort_index: Sort DataFrame by the index. + DataFrame.sort_values: Sort DataFrame by the value. + Series.sort_values : Sort Series by the value. Examples -------- @@ -2716,7 +2834,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer labels = index._sort_levels_monotonic() - indexer = lexsort_indexer(labels._get_labels_for_sorting(), + indexer = lexsort_indexer(labels._get_codes_for_sorting(), orders=ascending, na_position=na_position) else: @@ -2749,7 +2867,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, def argsort(self, axis=0, kind='quicksort', order=None): """ Overrides ndarray.argsort. Argsorts the value, omitting NA/null values, - and places the result in the same locations as the non-NA values + and places the result in the same locations as the non-NA values. 
Parameters ---------- @@ -2763,7 +2881,7 @@ def argsort(self, axis=0, kind='quicksort', order=None): ------- argsorted : Series, with -1 indicated where nan values are present - See also + See Also -------- numpy.ndarray.argsort """ @@ -2973,36 +3091,9 @@ def nsmallest(self, n=5, keep='first'): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest() - def sortlevel(self, level=0, ascending=True, sort_remaining=True): - """Sort Series with MultiIndex by chosen level. Data will be - lexicographically sorted by the chosen level followed by the other - levels (in order), - - .. deprecated:: 0.20.0 - Use :meth:`Series.sort_index` - - Parameters - ---------- - level : int or level name, default None - ascending : bool, default True - - Returns - ------- - sorted : Series - - See Also - -------- - Series.sort_index(level=...) - - """ - warnings.warn("sortlevel is deprecated, use sort_index(level=...)", - FutureWarning, stacklevel=2) - return self.sort_index(level=level, ascending=ascending, - sort_remaining=sort_remaining) - def swaplevel(self, i=-2, j=-1, copy=True): """ - Swap levels i and j in a MultiIndex + Swap levels i and j in a MultiIndex. Parameters ---------- @@ -3017,7 +3108,6 @@ def swaplevel(self, i=-2, j=-1, copy=True): The indexes ``i`` and ``j`` are now optional, and default to the two innermost levels of the index. - """ new_index = self.index.swaplevel(i, j) return self._constructor(self._values, index=new_index, @@ -3025,12 +3115,13 @@ def swaplevel(self, i=-2, j=-1, copy=True): def reorder_levels(self, order): """ - Rearrange index levels using input order. May not drop or duplicate - levels + Rearrange index levels using input order. + + May not drop or duplicate levels. Parameters ---------- - order : list of int representing new level order. + order : list of int representing new level order (reference level by number or key) Returns @@ -3171,8 +3262,7 @@ def map(self, arg, na_action=None): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- @@ -3204,10 +3294,10 @@ def _gotitem(self, key, ndim, subset=None): max 4 dtype: int64 - See also + See Also -------- - pandas.Series.apply : Invoke function on a Series. - pandas.Series.transform : Transform function producing + Series.apply : Invoke function on a Series. + Series.transform : Transform function producing a Series with like indexes. """) @@ -3251,38 +3341,41 @@ def transform(self, func, axis=0, *args, **kwargs): def apply(self, func, convert_dtype=True, args=(), **kwds): """ - Invoke function on values of Series. Can be ufunc (a NumPy function - that applies to the entire Series) or a Python function that only works - on single values + Invoke function on values of Series. + + Can be ufunc (a NumPy function that applies to the entire Series) + or a Python function that only works on single values. Parameters ---------- func : function - convert_dtype : boolean, default True + Python function or NumPy ufunc to apply. + convert_dtype : bool, default True Try to find better dtype for elementwise function results. If - False, leave as dtype=object + False, leave as dtype=object. args : tuple - Positional arguments to pass to function in addition to the value - Additional keyword arguments will be passed as keywords to the function + Positional arguments passed to func after the series value. + **kwds + Additional keyword arguments passed to func. 
Returns ------- - y : Series or DataFrame if func returns a Series + Series or DataFrame + If func returns a Series object the result will be a DataFrame. - See also + See Also -------- - Series.map: For element-wise operations - Series.agg: only perform aggregating type operations - Series.transform: only perform transforming type operations + Series.map: For element-wise operations. + Series.agg: Only perform aggregating type operations. + Series.transform: Only perform transforming type operations. Examples -------- - Create a series with typical summer temperatures for each city. - >>> series = pd.Series([20, 21, 12], index=['London', - ... 'New York','Helsinki']) - >>> series + >>> s = pd.Series([20, 21, 12], + ... index=['London', 'New York', 'Helsinki']) + >>> s London 20 New York 21 Helsinki 12 @@ -3292,8 +3385,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): argument to ``apply()``. >>> def square(x): - ... return x**2 - >>> series.apply(square) + ... return x ** 2 + >>> s.apply(square) London 400 New York 441 Helsinki 144 @@ -3302,7 +3395,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): Square the values by passing an anonymous function as an argument to ``apply()``. - >>> series.apply(lambda x: x**2) + >>> s.apply(lambda x: x ** 2) London 400 New York 441 Helsinki 144 @@ -3313,9 +3406,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): ``args`` keyword. >>> def subtract_custom_value(x, custom_value): - ... return x-custom_value + ... return x - custom_value - >>> series.apply(subtract_custom_value, args=(5,)) + >>> s.apply(subtract_custom_value, args=(5,)) London 15 New York 16 Helsinki 7 @@ -3326,10 +3419,10 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): >>> def add_custom_values(x, **kwargs): ... for month in kwargs: - ... x+=kwargs[month] + ... x += kwargs[month] ... return x - >>> series.apply(add_custom_values, june=30, july=20, august=25) + >>> s.apply(add_custom_values, june=30, july=20, august=25) London 95 New York 96 Helsinki 87 @@ -3337,13 +3430,11 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): Use a function from the Numpy library. - >>> series.apply(np.log) + >>> s.apply(np.log) London 2.995732 New York 3.044522 Helsinki 2.484907 dtype: float64 - - """ if len(self) == 0: return self._constructor(dtype=self.dtype, @@ -3385,23 +3476,31 @@ def f(x): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): """ - perform a reduction operation - - if we have an ndarray as a value, then simply perform the operation, - otherwise delegate to the object + Perform a reduction operation. + If we have an ndarray as a value, then simply perform the operation, + otherwise delegate to the object. """ delegate = self._values - if isinstance(delegate, np.ndarray): - # Validate that 'axis' is consistent with Series's single axis. 
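The _reduce rewrite in the hunk that follows lets extension arrays handle reductions themselves (via ExtensionArray._reduce) before the ndarray fallback. A hypothetical illustration, assuming a pandas build where the nullable Int64 extension dtype is available:

    import pandas as pd

    # a Series backed by an extension array; sum() now reaches
    # IntegerArray._reduce('sum', skipna=True) instead of coercing
    # the values to an ndarray first
    s = pd.Series([1, 2, 3], dtype='Int64')
    s.sum()   # -> 6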
- if axis is not None: - self._get_axis_number(axis) + + if axis is not None: + self._get_axis_number(axis) + + # dispatch to ExtensionArray interface + if isinstance(delegate, ExtensionArray): + return delegate._reduce(name, skipna=skipna, **kwds) + + # dispatch to numpy arrays + elif isinstance(delegate, np.ndarray): if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) with np.errstate(all='ignore'): return op(delegate, skipna=skipna, **kwds) + # TODO(EA) dispatch to Index + # remove once all internals extension types are + # moved to ExtensionArrays return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only, filter_type=filter_type, **kwds) @@ -3417,8 +3516,9 @@ def _reindex_indexer(self, new_index, indexer, copy): return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): - """ check if we do need a multi reindex; this is for compat with - higher dims + """ + Check if we do need a multi reindex; this is for compat with + higher dims. """ return False @@ -3433,7 +3533,8 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, broadcast_axis=broadcast_axis) def rename(self, index=None, **kwargs): - """Alter Series index labels or name + """ + Alter Series index labels or name. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left as-is. Extra labels listed don't throw an @@ -3450,9 +3551,9 @@ def rename(self, index=None, **kwargs): the index. Scalar or hashable sequence-like will alter the ``Series.name`` attribute. - copy : boolean, default True + copy : bool, default True Also copy underlying data - inplace : boolean, default False + inplace : bool, default False Whether to return a new Series. If True then value of copy is ignored. level : int or level name, default None @@ -3465,7 +3566,7 @@ def rename(self, index=None, **kwargs): See Also -------- - pandas.Series.rename_axis + Series.rename_axis Examples -------- @@ -3567,8 +3668,8 @@ def drop(self, labels=None, axis=0, index=None, columns=None, >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], - ... labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], ... index=midx) >>> s @@ -3617,7 +3718,8 @@ def shift(self, periods=1, freq=None, axis=0): return super(Series, self).shift(periods=periods, freq=freq, axis=axis) def reindex_axis(self, labels, axis=0, **kwargs): - """Conform Series to new index with optional filling logic. + """ + Conform Series to new index with optional filling logic. .. deprecated:: 0.21.0 Use ``Series.reindex`` instead. @@ -3739,7 +3841,7 @@ def isin(self, values): See Also -------- - pandas.DataFrame.isin : equivalent method on DataFrame + DataFrame.isin : Equivalent method on DataFrame. Examples -------- @@ -3797,8 +3899,8 @@ def between(self, left, right, inclusive=True): See Also -------- - pandas.Series.gt : Greater than of series and other - pandas.Series.lt : Less than of series and other + Series.gt : Greater than of series and other. + Series.lt : Less than of series and other. 
Examples -------- @@ -3846,7 +3948,8 @@ def between(self, left, right, inclusive=True): @classmethod def from_csv(cls, path, sep=',', parse_dates=True, header=None, index_col=0, encoding=None, infer_datetime_format=False): - """Read CSV file. + """ + Read CSV file. .. deprecated:: 0.21.0 Use :func:`pandas.read_csv` instead. @@ -3883,14 +3986,14 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, encoding : string, optional a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 - infer_datetime_format: boolean, default False + infer_datetime_format : boolean, default False If True and `parse_dates` is True for a column, try to infer the datetime format based on the first datetime string. If the format can be inferred, there often will be a large parsing speed-up. - See also + See Also -------- - pandas.read_csv + read_csv Returns ------- @@ -3967,19 +4070,6 @@ def to_csv(self, *args, **kwargs): kwargs["header"] = False # Backwards compatibility. return self.to_frame().to_csv(**kwargs) - @Appender(generic._shared_docs['to_excel'] % _shared_doc_kwargs) - def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', verbose=True): - df = self.to_frame() - df.to_excel(excel_writer=excel_writer, sheet_name=sheet_name, - na_rep=na_rep, float_format=float_format, columns=columns, - header=header, index=index, index_label=index_label, - startrow=startrow, startcol=startcol, engine=engine, - merge_cells=merge_cells, encoding=encoding, - inf_rep=inf_rep, verbose=verbose) - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) def isna(self): return super(Series, self).isna() @@ -4089,7 +4179,8 @@ def dropna(self, axis=0, inplace=False, **kwargs): return self.copy() def valid(self, inplace=False, **kwargs): - """Return Series without null values. + """ + Return Series without null values. .. deprecated:: 0.23.0 Use :meth:`Series.dropna` instead. @@ -4103,7 +4194,7 @@ def valid(self, inplace=False, **kwargs): def to_timestamp(self, freq=None, how='start', copy=True): """ - Cast to datetimeindex of timestamps, at *beginning* of period + Cast to datetimeindex of timestamps, at *beginning* of period. Parameters ---------- @@ -4128,7 +4219,7 @@ def to_timestamp(self, freq=None, how='start', copy=True): def to_period(self, freq=None, copy=True): """ Convert Series from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed) + frequency (inferred from index if not passed). Parameters ---------- @@ -4153,6 +4244,7 @@ def to_period(self, freq=None, copy=True): dt = CachedAccessor("dt", CombinedDatetimelikeProperties) cat = CachedAccessor("cat", CategoricalAccessor) plot = CachedAccessor("plot", gfx.SeriesPlotMethods) + sparse = CachedAccessor("sparse", SparseAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series @@ -4168,198 +4260,3 @@ def to_period(self, freq=None, copy=True): # Add arithmetic! 
ops.add_flex_arithmetic_methods(Series) ops.add_special_arithmetic_methods(Series) - - -# ----------------------------------------------------------------------------- -# Supplementary functions - - -def _sanitize_index(data, index, copy=False): - """ sanitize an index type to return an ndarray of the underlying, pass - thru a non-Index - """ - - if index is None: - return data - - if len(data) != len(index): - raise ValueError('Length of values does not match length of ' 'index') - - if isinstance(data, ABCIndexClass) and not copy: - pass - elif isinstance(data, (PeriodIndex, DatetimeIndex)): - data = data._values - if copy: - data = data.copy() - - elif isinstance(data, np.ndarray): - - # coerce datetimelike types - if data.dtype.kind in ['M', 'm']: - data = _sanitize_array(data, index, copy=copy) - - return data - - -def _sanitize_array(data, index, dtype=None, copy=False, - raise_cast_failure=False): - """ sanitize input data to an ndarray, copy if specified, coerce to the - dtype if specified - """ - - if dtype is not None: - dtype = pandas_dtype(dtype) - - if isinstance(data, ma.MaskedArray): - mask = ma.getmaskarray(data) - if mask.any(): - data, fill_value = maybe_upcast(data, copy=True) - data[mask] = fill_value - else: - data = data.copy() - - def _try_cast(arr, take_fast_path): - - # perf shortcut as this is the most common case - if take_fast_path: - if maybe_castable(arr) and not copy and dtype is None: - return arr - - try: - # gh-15832: Check if we are requesting a numeric dype and - # that we can convert the data to the requested dtype. - if is_integer_dtype(dtype): - subarr = maybe_cast_to_integer_array(arr, dtype) - - subarr = maybe_cast_to_datetime(arr, dtype) - # Take care in creating object arrays (but iterators are not - # supported): - if is_object_dtype(dtype) and (is_list_like(subarr) and - not (is_iterator(subarr) or - isinstance(subarr, np.ndarray))): - subarr = construct_1d_object_array_from_listlike(subarr) - elif not is_extension_type(subarr): - subarr = construct_1d_ndarray_preserving_na(subarr, dtype, - copy=copy) - except (ValueError, TypeError): - if is_categorical_dtype(dtype): - # We *do* allow casting to categorical, since we know - # that Categorical is the only array type for 'category'. - subarr = Categorical(arr, dtype.categories, - ordered=dtype.ordered) - elif is_extension_array_dtype(dtype): - # create an extension array from its dtype - array_type = dtype.construct_array_type()._from_sequence - subarr = array_type(subarr, dtype=dtype, copy=copy) - - elif dtype is not None and raise_cast_failure: - raise - else: - subarr = np.array(arr, dtype=object, copy=copy) - return subarr - - # GH #846 - if isinstance(data, (np.ndarray, Index, Series)): - - if dtype is not None: - subarr = np.array(data, copy=False) - - # possibility of nan -> garbage - if is_float_dtype(data.dtype) and is_integer_dtype(dtype): - if not isna(data).any(): - subarr = _try_cast(data, True) - elif copy: - subarr = data.copy() - else: - subarr = _try_cast(data, True) - elif isinstance(data, Index): - # don't coerce Index types - # e.g. 
indexes can have different conversions (so don't fast path - # them) - # GH 6140 - subarr = _sanitize_index(data, index, copy=copy) - else: - - # we will try to copy be-definition here - subarr = _try_cast(data, True) - - elif isinstance(data, ExtensionArray): - subarr = data - - if dtype is not None and not data.dtype.is_dtype(dtype): - subarr = data.astype(dtype) - - if copy: - subarr = data.copy() - return subarr - - elif isinstance(data, (list, tuple)) and len(data) > 0: - if dtype is not None: - try: - subarr = _try_cast(data, False) - except Exception: - if raise_cast_failure: # pragma: no cover - raise - subarr = np.array(data, dtype=object, copy=copy) - subarr = lib.maybe_convert_objects(subarr) - - else: - subarr = maybe_convert_platform(data) - - subarr = maybe_cast_to_datetime(subarr, dtype) - - elif isinstance(data, range): - # GH 16804 - start, stop, step = get_range_parameters(data) - arr = np.arange(start, stop, step, dtype='int64') - subarr = _try_cast(arr, False) - else: - subarr = _try_cast(data, False) - - # scalar like, GH - if getattr(subarr, 'ndim', 0) == 0: - if isinstance(data, list): # pragma: no cover - subarr = np.array(data, dtype=object) - elif index is not None: - value = data - - # figure out the dtype from the value (upcast if necessary) - if dtype is None: - dtype, value = infer_dtype_from_scalar(value) - else: - # need to possibly convert the value here - value = maybe_cast_to_datetime(value, dtype) - - subarr = construct_1d_arraylike_from_scalar( - value, len(index), dtype) - - else: - return subarr.item() - - # the result that we want - elif subarr.ndim == 1: - if index is not None: - - # a 1-element ndarray - if len(subarr) != len(index) and len(subarr) == 1: - subarr = construct_1d_arraylike_from_scalar( - subarr[0], len(index), subarr.dtype) - - elif subarr.ndim > 1: - if isinstance(data, np.ndarray): - raise Exception('Data must be 1-dimensional') - else: - subarr = com.asarray_tuplesafe(data, dtype=dtype) - - # This is to prevent mixed-type Series getting all casted to - # NumPy string type, e.g. NaN --> '-1#IND'. 
- if issubclass(subarr.dtype.type, compat.string_types): - # GH 16605 - # If not empty convert the data to dtype - # GH 19853: If data is a scalar, subarr has already the result - if not is_scalar(data): - if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - subarr = np.array(data, dtype=object, copy=copy) - - return subarr diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5aa9ea658482b..b34dfddcc66e1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,18 +1,17 @@ """ miscellaneous sorting / groupby utilities """ import numpy as np -from pandas.compat import long, string_types, PY3 -from pandas.core.dtypes.common import ( - ensure_platform_int, - ensure_int64, - is_list_like, - is_categorical_dtype) + +from pandas._libs import algos, hashtable, lib +from pandas._libs.hashtable import unique_label_indices +from pandas.compat import PY3, long, string_types + from pandas.core.dtypes.cast import infer_dtype_from_array +from pandas.core.dtypes.common import ( + ensure_int64, ensure_platform_int, is_categorical_dtype, is_list_like) from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algorithms -from pandas._libs import lib, algos, hashtable -from pandas._libs.hashtable import unique_label_indices +import pandas.core.algorithms as algorithms _INT64_MAX = np.iinfo(np.int64).max @@ -241,7 +240,19 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): # specially handle Categorical if is_categorical_dtype(items): - return items.argsort(ascending=ascending, kind=kind) + if na_position not in {'first', 'last'}: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + + mask = isna(items) + cnt_null = mask.sum() + sorted_idx = items.argsort(ascending=ascending, kind=kind) + if ascending and na_position == 'last': + # NaN is coded as -1 and is listed in front after sorting + sorted_idx = np.roll(sorted_idx, -cnt_null) + elif not ascending and na_position == 'first': + # NaN is coded as -1 and is listed in the end after sorting + sorted_idx = np.roll(sorted_idx, cnt_null) + return sorted_idx items = np.asanyarray(items) idx = np.arange(len(items)) diff --git a/pandas/core/sparse/api.py b/pandas/core/sparse/api.py index 85941e6923338..33e8b921905ba 100644 --- a/pandas/core/sparse/api.py +++ b/pandas/core/sparse/api.py @@ -1,5 +1,5 @@ # pylint: disable=W0611 # flake8: noqa -from pandas.core.sparse.array import SparseArray -from pandas.core.sparse.series import SparseSeries +from pandas.core.arrays.sparse import SparseArray, SparseDtype from pandas.core.sparse.frame import SparseDataFrame +from pandas.core.sparse.series import SparseSeries diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py deleted file mode 100644 index 186a2490a5f2e..0000000000000 --- a/pandas/core/sparse/array.py +++ /dev/null @@ -1,852 +0,0 @@ -""" -SparseArray data structure -""" -from __future__ import division -# pylint: disable=E1101,E1103,W0231 - -import numpy as np -import warnings - -import pandas as pd -from pandas.core.base import PandasObject, IndexOpsMixin - -from pandas import compat -from pandas.compat import range, PYPY -from pandas.compat.numpy import function as nv - -from pandas.core.dtypes.generic import ABCSparseSeries -from pandas.core.dtypes.common import ( - ensure_platform_int, - is_float, is_integer, - is_object_dtype, - is_integer_dtype, - is_bool_dtype, - is_list_like, - is_string_dtype, - is_scalar, is_dtype_equal) -from pandas.core.dtypes.cast import ( - 
maybe_convert_platform, maybe_promote, - astype_nansafe, find_common_type, infer_dtype_from_scalar, - construct_1d_arraylike_from_scalar) -from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype - -import pandas._libs.sparse as splib -import pandas._libs.lib as lib -from pandas._libs.sparse import SparseIndex, BlockIndex, IntIndex -from pandas._libs import index as libindex -import pandas.core.algorithms as algos -import pandas.core.ops as ops -import pandas.io.formats.printing as printing -from pandas.util._decorators import Appender -from pandas.core.indexes.base import _index_shared_docs - - -_sparray_doc_kwargs = dict(klass='SparseArray') - - -def _get_fill(arr): - # coerce fill_value to arr dtype if possible - # int64 SparseArray can have NaN as fill_value if there is no missing - try: - return np.asarray(arr.fill_value, dtype=arr.dtype) - except ValueError: - return np.asarray(arr.fill_value) - - -def _sparse_array_op(left, right, op, name): - if name.startswith('__'): - # For lookups in _libs.sparse we need non-dunder op name - name = name[2:-2] - - # dtype used to find corresponding sparse method - if not is_dtype_equal(left.dtype, right.dtype): - dtype = find_common_type([left.dtype, right.dtype]) - left = left.astype(dtype) - right = right.astype(dtype) - else: - dtype = left.dtype - - # dtype the result must have - result_dtype = None - - if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: - with np.errstate(all='ignore'): - result = op(left.get_values(), right.get_values()) - fill = op(_get_fill(left), _get_fill(right)) - - if left.sp_index.ngaps == 0: - index = left.sp_index - else: - index = right.sp_index - elif left.sp_index.equals(right.sp_index): - with np.errstate(all='ignore'): - result = op(left.sp_values, right.sp_values) - fill = op(_get_fill(left), _get_fill(right)) - index = left.sp_index - else: - if name[0] == 'r': - left, right = right, left - name = name[1:] - - if name in ('and', 'or') and dtype == 'bool': - opname = 'sparse_{name}_uint8'.format(name=name) - # to make template simple, cast here - left_sp_values = left.sp_values.view(np.uint8) - right_sp_values = right.sp_values.view(np.uint8) - result_dtype = np.bool - else: - opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) - left_sp_values = left.sp_values - right_sp_values = right.sp_values - - sparse_op = getattr(splib, opname) - with np.errstate(all='ignore'): - result, index, fill = sparse_op(left_sp_values, left.sp_index, - left.fill_value, right_sp_values, - right.sp_index, right.fill_value) - - if result_dtype is None: - result_dtype = result.dtype - - return _wrap_result(name, result, index, fill, dtype=result_dtype) - - -def _wrap_result(name, data, sparse_index, fill_value, dtype=None): - """ wrap op result to have correct dtype """ - if name.startswith('__'): - # e.g. __eq__ --> eq - name = name[2:-2] - - if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): - dtype = np.bool - - if is_bool_dtype(dtype): - # fill_value may be np.bool_ - fill_value = bool(fill_value) - return SparseArray(data, sparse_index=sparse_index, - fill_value=fill_value, dtype=dtype) - - -class SparseArray(PandasObject, np.ndarray): - """Data structure for labeled, sparse floating point 1-D data - - Parameters - ---------- - data : {array-like (1-D), Series, SparseSeries, dict} - kind : {'block', 'integer'} - fill_value : float - Code for missing value. Defaults depends on dtype. 
- 0 for int dtype, False for bool dtype, and NaN for other dtypes - sparse_index : {BlockIndex, IntIndex}, optional - Only if you have one. Mainly used internally - - Notes - ----- - SparseArray objects are immutable via the typical Python means. If you - must change values, convert to dense, make your changes, then convert back - to sparse - """ - __array_priority__ = 15 - _typ = 'array' - _subtyp = 'sparse_array' - - sp_index = None - fill_value = None - - def __new__(cls, data, sparse_index=None, index=None, kind='integer', - fill_value=None, dtype=None, copy=False): - - if index is not None: - if data is None: - data = np.nan - if not is_scalar(data): - raise Exception("must only pass scalars with an index ") - dtype = infer_dtype_from_scalar(data)[0] - data = construct_1d_arraylike_from_scalar( - data, len(index), dtype) - - if isinstance(data, ABCSparseSeries): - data = data.values - is_sparse_array = isinstance(data, SparseArray) - - if dtype is not None: - dtype = np.dtype(dtype) - - if is_sparse_array: - sparse_index = data.sp_index - values = data.sp_values - fill_value = data.fill_value - else: - # array-like - if sparse_index is None: - if dtype is not None: - data = np.asarray(data, dtype=dtype) - res = make_sparse(data, kind=kind, fill_value=fill_value) - values, sparse_index, fill_value = res - else: - values = _sanitize_values(data) - if len(values) != sparse_index.npoints: - raise AssertionError("Non array-like type {type} must " - "have the same length as the index" - .format(type=type(values))) - # Create array, do *not* copy data by default - if copy: - subarr = np.array(values, dtype=dtype, copy=True) - else: - subarr = np.asarray(values, dtype=dtype) - # Change the class of the array to be the subclass type. - return cls._simple_new(subarr, sparse_index, fill_value) - - @classmethod - def _simple_new(cls, data, sp_index, fill_value): - if not isinstance(sp_index, SparseIndex): - # caller must pass SparseIndex - raise ValueError('sp_index must be a SparseIndex') - - if fill_value is None: - if sp_index.ngaps > 0: - # has missing hole - fill_value = np.nan - else: - fill_value = na_value_for_dtype(data.dtype) - - if (is_integer_dtype(data) and is_float(fill_value) and - sp_index.ngaps > 0): - # if float fill_value is being included in dense repr, - # convert values to float - data = data.astype(float) - - result = data.view(cls) - - if not isinstance(sp_index, SparseIndex): - # caller must pass SparseIndex - raise ValueError('sp_index must be a SparseIndex') - - result.sp_index = sp_index - result._fill_value = fill_value - return result - - @property - def _constructor(self): - return lambda x: SparseArray(x, fill_value=self.fill_value, - kind=self.kind) - - @property - def kind(self): - if isinstance(self.sp_index, BlockIndex): - return 'block' - elif isinstance(self.sp_index, IntIndex): - return 'integer' - - @Appender(IndexOpsMixin.memory_usage.__doc__) - def memory_usage(self, deep=False): - values = self.sp_values - - v = values.nbytes - - if deep and is_object_dtype(self) and not PYPY: - v += lib.memory_usage_of_objects(values) - - return v - - def __array_wrap__(self, out_arr, context=None): - """ - NumPy calls this method when ufunc is applied - - Parameters - ---------- - - out_arr : ndarray - ufunc result (note that ufunc is only applied to sp_values) - context : tuple of 3 elements (ufunc, signature, domain) - for example, following is a context when np.sin is applied to - SparseArray, - - (, (SparseArray,), 0)) - - See 
http://docs.scipy.org/doc/numpy/user/basics.subclassing.html - """ - if isinstance(context, tuple) and len(context) == 3: - ufunc, args, domain = context - # to apply ufunc only to fill_value (to avoid recursive call) - args = [getattr(a, 'fill_value', a) for a in args] - with np.errstate(all='ignore'): - fill_value = ufunc(self.fill_value, *args[1:]) - else: - fill_value = self.fill_value - - return self._simple_new(out_arr, sp_index=self.sp_index, - fill_value=fill_value) - - def __array_finalize__(self, obj): - """ - Gets called after any ufunc or other array operations, necessary - to pass on the index. - """ - self.sp_index = getattr(obj, 'sp_index', None) - self._fill_value = getattr(obj, 'fill_value', None) - - def __reduce__(self): - """Necessary for making this object picklable""" - object_state = list(np.ndarray.__reduce__(self)) - subclass_state = self.fill_value, self.sp_index - object_state[2] = self.sp_values.__reduce__()[2] - object_state[2] = (object_state[2], subclass_state) - return tuple(object_state) - - def __setstate__(self, state): - """Necessary for making this object picklable""" - nd_state, own_state = state - np.ndarray.__setstate__(self, nd_state) - - fill_value, sp_index = own_state[:2] - self.sp_index = sp_index - self._fill_value = fill_value - - def __len__(self): - try: - return self.sp_index.length - except AttributeError: - return 0 - - def __unicode__(self): - return '{self}\nFill: {fill}\n{index}'.format( - self=printing.pprint_thing(self), - fill=printing.pprint_thing(self.fill_value), - index=printing.pprint_thing(self.sp_index)) - - def disable(self, other): - raise NotImplementedError('inplace binary ops not supported') - # Inplace operators - __iadd__ = disable - __isub__ = disable - __imul__ = disable - __itruediv__ = disable - __ifloordiv__ = disable - __ipow__ = disable - - # Python 2 division operators - if not compat.PY3: - __idiv__ = disable - - @property - def values(self): - """ - Dense values - """ - output = np.empty(len(self), dtype=self.dtype) - int_index = self.sp_index.to_int_index() - output.fill(self.fill_value) - output.put(int_index.indices, self) - return output - - @property - def shape(self): - return (len(self),) - - @property - def sp_values(self): - # caching not an option, leaks memory - return self.view(np.ndarray) - - @property - def fill_value(self): - return self._fill_value - - @fill_value.setter - def fill_value(self, value): - if not is_scalar(value): - raise ValueError('fill_value must be a scalar') - # if the specified value triggers type promotion, raise ValueError - new_dtype, fill_value = maybe_promote(self.dtype, value) - if is_dtype_equal(self.dtype, new_dtype): - self._fill_value = fill_value - else: - msg = 'unable to set fill_value {fill} to {dtype} dtype' - raise ValueError(msg.format(fill=value, dtype=self.dtype)) - - def get_values(self, fill=None): - """ return a dense representation """ - return self.to_dense(fill=fill) - - def to_dense(self, fill=None): - """ - Convert SparseArray to a NumPy array. - - Parameters - ---------- - fill: float, default None - .. deprecated:: 0.20.0 - This argument is not respected by this function. 
- - Returns - ------- - arr : NumPy array - """ - if fill is not None: - warnings.warn(("The 'fill' parameter has been deprecated and " - "will be removed in a future version."), - FutureWarning, stacklevel=2) - return self.values - - def __iter__(self): - if np.issubdtype(self.dtype, np.floating): - boxer = float - elif np.issubdtype(self.dtype, np.integer): - boxer = int - else: - boxer = lambda x: x - - for i in range(len(self)): - r = self._get_val_at(i) - - # box em - yield boxer(r) - - def __getitem__(self, key): - """ - - """ - - if is_integer(key): - return self._get_val_at(key) - elif isinstance(key, tuple): - data_slice = self.values[key] - else: - if isinstance(key, SparseArray): - if is_bool_dtype(key): - key = key.to_dense() - else: - key = np.asarray(key) - - if hasattr(key, '__len__') and len(self) != len(key): - return self.take(key) - else: - data_slice = self.values[key] - - return self._constructor(data_slice) - - def __getslice__(self, i, j): - if i < 0: - i = 0 - if j < 0: - j = 0 - slobj = slice(i, j) - return self.__getitem__(slobj) - - def _get_val_at(self, loc): - n = len(self) - if loc < 0: - loc += n - - if loc >= n or loc < 0: - raise IndexError('Out of bounds access') - - sp_loc = self.sp_index.lookup(loc) - if sp_loc == -1: - return self.fill_value - else: - # libindex.get_value_at will end up calling __getitem__, - # so to avoid recursing we need to unwrap `self` so the - # ndarray.__getitem__ implementation is called. - return libindex.get_value_at(np.asarray(self), sp_loc) - - @Appender(_index_shared_docs['take'] % _sparray_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - """ - Sparse-compatible version of ndarray.take - - Returns - ------- - taken : ndarray - """ - nv.validate_take(tuple(), kwargs) - - if axis: - raise ValueError("axis must be 0, input was {axis}" - .format(axis=axis)) - - if is_integer(indices): - # return scalar - return self[indices] - - indices = ensure_platform_int(indices) - n = len(self) - if allow_fill and fill_value is not None: - # allow -1 to indicate self.fill_value, - # self.fill_value may not be NaN - if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - raise ValueError(msg) - elif (n <= indices).any(): - msg = 'index is out of bounds for size {size}'.format(size=n) - raise IndexError(msg) - else: - if ((indices < -n) | (n <= indices)).any(): - msg = 'index is out of bounds for size {size}'.format(size=n) - raise IndexError(msg) - - indices = indices.astype(np.int32) - if not (allow_fill and fill_value is not None): - indices = indices.copy() - indices[indices < 0] += n - - locs = self.sp_index.lookup_array(indices) - indexer = np.arange(len(locs), dtype=np.int32) - mask = locs != -1 - if mask.any(): - indexer = indexer[mask] - new_values = self.sp_values.take(locs[mask]) - else: - indexer = np.empty(shape=(0, ), dtype=np.int32) - new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype) - - sp_index = _make_index(len(indices), indexer, kind=self.sp_index) - return self._simple_new(new_values, sp_index, self.fill_value) - - def __setitem__(self, key, value): - # if is_integer(key): - # self.values[key] = value - # else: - # raise Exception("SparseArray does not support setting non-scalars - # via setitem") - raise TypeError( - "SparseArray does not support item assignment via setitem") - - def __setslice__(self, i, j, value): - if i < 0: - i = 0 - if j < 0: - j = 0 - slobj = slice(i, j) # noqa - - # if not 
is_scalar(value): - # raise Exception("SparseArray does not support setting non-scalars - # via slices") - - # x = self.values - # x[slobj] = value - # self.values = x - raise TypeError("SparseArray does not support item assignment via " - "slices") - - def astype(self, dtype=None, copy=True): - dtype = np.dtype(dtype) - sp_values = astype_nansafe(self.sp_values, dtype, copy=copy) - try: - if is_bool_dtype(dtype): - # to avoid np.bool_ dtype - fill_value = bool(self.fill_value) - else: - fill_value = dtype.type(self.fill_value) - except ValueError: - msg = 'unable to coerce current fill_value {fill} to {dtype} dtype' - raise ValueError(msg.format(fill=self.fill_value, dtype=dtype)) - return self._simple_new(sp_values, self.sp_index, - fill_value=fill_value) - - def copy(self, deep=True): - """ - Make a copy of the SparseArray. Only the actual sparse values need to - be copied. - """ - if deep: - values = self.sp_values.copy() - else: - values = self.sp_values - return SparseArray(values, sparse_index=self.sp_index, - dtype=self.dtype, fill_value=self.fill_value) - - def count(self): - """ - Compute sum of non-NA/null observations in SparseArray. If the - fill_value is not NaN, the "sparse" locations will be included in the - observation count. - - Returns - ------- - nobs : int - """ - sp_values = self.sp_values - valid_spvals = np.isfinite(sp_values).sum() - if self._null_fill_value: - return valid_spvals - else: - return valid_spvals + self.sp_index.ngaps - - @property - def _null_fill_value(self): - return isna(self.fill_value) - - @property - def _valid_sp_values(self): - sp_vals = self.sp_values - mask = notna(sp_vals) - return sp_vals[mask] - - @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs) - def fillna(self, value, downcast=None): - if downcast is not None: - raise NotImplementedError - - if issubclass(self.dtype.type, np.floating): - value = float(value) - - new_values = np.where(isna(self.sp_values), value, self.sp_values) - fill_value = value if self._null_fill_value else self.fill_value - - return self._simple_new(new_values, self.sp_index, - fill_value=fill_value) - - def all(self, axis=0, *args, **kwargs): - """ - Tests whether all elements evaluate True - - Returns - ------- - all : bool - - See Also - -------- - numpy.all - """ - nv.validate_all(args, kwargs) - - values = self.sp_values - - if len(values) != len(self) and not np.all(self.fill_value): - return False - - return values.all() - - def any(self, axis=0, *args, **kwargs): - """ - Tests whether at least one of elements evaluate True - - Returns - ------- - any : bool - - See Also - -------- - numpy.any - """ - nv.validate_any(args, kwargs) - - values = self.sp_values - - if len(values) != len(self) and np.any(self.fill_value): - return True - - return values.any() - - def sum(self, axis=0, *args, **kwargs): - """ - Sum of non-NA/null values - - Returns - ------- - sum : float - """ - nv.validate_sum(args, kwargs) - valid_vals = self._valid_sp_values - sp_sum = valid_vals.sum() - if self._null_fill_value: - return sp_sum - else: - nsparse = self.sp_index.ngaps - return sp_sum + self.fill_value * nsparse - - def cumsum(self, axis=0, *args, **kwargs): - """ - Cumulative sum of non-NA/null values. - - When performing the cumulative summation, any non-NA/null values will - be skipped. The resulting SparseArray will preserve the locations of - NaN values, but the fill value will be `np.nan` regardless. - - Parameters - ---------- - axis : int or None - Axis over which to perform the cumulative summation. 
If None, - perform cumulative summation over flattened array. - - Returns - ------- - cumsum : SparseArray - """ - nv.validate_cumsum(args, kwargs) - - if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour. - raise ValueError("axis(={axis}) out of bounds".format(axis=axis)) - - if not self._null_fill_value: - return SparseArray(self.to_dense()).cumsum() - - return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index, - fill_value=self.fill_value) - - def mean(self, axis=0, *args, **kwargs): - """ - Mean of non-NA/null values - - Returns - ------- - mean : float - """ - nv.validate_mean(args, kwargs) - valid_vals = self._valid_sp_values - sp_sum = valid_vals.sum() - ct = len(valid_vals) - - if self._null_fill_value: - return sp_sum / ct - else: - nsparse = self.sp_index.ngaps - return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) - - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of unique values. - - Parameters - ---------- - dropna : boolean, default True - Don't include counts of NaN, even if NaN is in sp_values. - - Returns - ------- - counts : Series - """ - keys, counts = algos._value_counts_arraylike(self.sp_values, - dropna=dropna) - fcounts = self.sp_index.ngaps - if fcounts > 0: - if self._null_fill_value and dropna: - pass - else: - if self._null_fill_value: - mask = pd.isna(keys) - else: - mask = keys == self.fill_value - - if mask.any(): - counts[mask] += fcounts - else: - keys = np.insert(keys, 0, self.fill_value) - counts = np.insert(counts, 0, fcounts) - - if not isinstance(keys, pd.Index): - keys = pd.Index(keys) - result = pd.Series(counts, index=keys) - return result - - -def _maybe_to_dense(obj): - """ try to convert to dense """ - if hasattr(obj, 'to_dense'): - return obj.to_dense() - return obj - - -def _maybe_to_sparse(array): - """ array must be SparseSeries or SparseArray """ - if isinstance(array, ABCSparseSeries): - array = array.values.copy() - return array - - -def _sanitize_values(arr): - """ - return an ndarray for our input, - in a platform independent manner - """ - - if hasattr(arr, 'values'): - arr = arr.values - else: - - # scalar - if is_scalar(arr): - arr = [arr] - - # ndarray - if isinstance(arr, np.ndarray): - pass - - elif is_list_like(arr) and len(arr) > 0: - arr = maybe_convert_platform(arr) - - else: - arr = np.asarray(arr) - - return arr - - -def make_sparse(arr, kind='block', fill_value=None): - """ - Convert ndarray to sparse format - - Parameters - ---------- - arr : ndarray - kind : {'block', 'integer'} - fill_value : NaN or another value - - Returns - ------- - (sparse_values, index) : (ndarray, SparseIndex) - """ - - arr = _sanitize_values(arr) - - if arr.ndim > 1: - raise TypeError("expected dimension <= 1 data") - - if fill_value is None: - fill_value = na_value_for_dtype(arr.dtype) - - if isna(fill_value): - mask = notna(arr) - else: - # For str arrays in NumPy 1.12.0, operator!= below isn't - # element-wise but just returns False if fill_value is not str, - # so cast to object comparison to be safe - if is_string_dtype(arr): - arr = arr.astype(object) - - if is_object_dtype(arr.dtype): - # element-wise equality check method in numpy doesn't treat - # each element type, eg. 0, 0.0, and False are treated as - # same. So we have to check the both of its type and value. 
- mask = splib.make_mask_object_ndarray(arr, fill_value) - else: - mask = arr != fill_value - - length = len(arr) - if length != mask.size: - # the arr is a SparseArray - indices = mask.sp_index.indices - else: - indices = mask.nonzero()[0].astype(np.int32) - - index = _make_index(length, indices, kind) - sparsified_values = arr[mask] - return sparsified_values, index, fill_value - - -def _make_index(length, indices, kind): - - if kind == 'block' or isinstance(kind, BlockIndex): - locs, lens = splib.get_blocks(indices) - index = BlockIndex(length, locs, lens) - elif kind == 'integer' or isinstance(kind, IntIndex): - index = IntIndex(length, indices) - else: # pragma: no cover - raise ValueError('must be block or integer type') - return index - - -ops.add_special_arithmetic_methods(SparseArray) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 58e3001bcfe6a..586193fe11850 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -3,31 +3,37 @@ with float64 data """ from __future__ import division -# pylint: disable=E1101,E1103,W0231,E0202 import warnings -from pandas.compat import lmap -from pandas import compat + import numpy as np -from pandas.core.dtypes.missing import isna, notna -from pandas.core.dtypes.cast import maybe_upcast, find_common_type +from pandas._libs.sparse import BlockIndex, get_blocks +import pandas.compat as compat +from pandas.compat import lmap +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender + +from pandas.core.dtypes.cast import find_common_type, maybe_upcast from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse +from pandas.core.dtypes.missing import isna, notna -from pandas.compat.numpy import function as nv -from pandas.core.index import Index, MultiIndex, ensure_index -from pandas.core.series import Series -from pandas.core.frame import DataFrame, extract_index, _prep_ndarray import pandas.core.algorithms as algos -from pandas.core.internals import (BlockManager, - create_block_manager_from_arrays) -import pandas.core.generic as generic -from pandas.core.sparse.series import SparseSeries, SparseArray -from pandas._libs.sparse import BlockIndex, get_blocks -from pandas.util._decorators import Appender -import pandas.core.ops as ops +from pandas.core.arrays.sparse import SparseArray, SparseDtype import pandas.core.common as com +from pandas.core.frame import DataFrame +import pandas.core.generic as generic +from pandas.core.index import Index, MultiIndex, ensure_index import pandas.core.indexes.base as ibase +from pandas.core.internals import ( + BlockManager, create_block_manager_from_arrays) +from pandas.core.internals.construction import extract_index, prep_ndarray +import pandas.core.ops as ops +from pandas.core.series import Series +from pandas.core.sparse.series import SparseSeries + +# pylint: disable=E1101,E1103,W0231,E0202 + _shared_doc_kwargs = dict(klass='SparseDataFrame') @@ -169,20 +175,27 @@ def sp_maker(x): v = [v.get(i, np.nan) for i in index] v = sp_maker(v) + + if index is not None and len(v) != len(index): + msg = "Length of passed values is {}, index implies {}" + raise ValueError(msg.format(len(v), len(index))) sdict[k] = v - # TODO: figure out how to handle this case, all nan's? 
- # add in any other columns we want to have (completeness) - nan_arr = np.empty(len(index), dtype='float64') - nan_arr.fill(np.nan) - nan_arr = sp_maker(nan_arr) - sdict.update((c, nan_arr) for c in columns if c not in sdict) + if len(columns.difference(sdict)): + # TODO: figure out how to handle this case, all nan's? + # add in any other columns we want to have (completeness) + nan_arr = np.empty(len(index), dtype='float64') + nan_arr.fill(np.nan) + nan_arr = SparseArray(nan_arr, kind=self._default_kind, + fill_value=self._default_fill_value, + copy=False) + sdict.update((c, nan_arr) for c in columns if c not in sdict) return to_manager(sdict, columns, index) def _init_matrix(self, data, index, columns, dtype=None): """ Init self from ndarray or list of lists """ - data = _prep_ndarray(data, copy=False) + data = prep_ndarray(data, copy=False) index, columns = self._prep_index(data, index, columns) data = {idx: data[:, i] for i, idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype) @@ -260,6 +273,9 @@ def to_coo(self): raise ImportError('Scipy is not installed') dtype = find_common_type(self.dtypes) + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + cols, rows, datas = [], [], [] for col, name in enumerate(self): s = self[name] @@ -324,9 +340,8 @@ def to_dense(self): def _apply_columns(self, func): """ get new SparseDataFrame applying func to each columns """ - new_data = {} - for col, series in compat.iteritems(self): - new_data[col] = func(series) + new_data = {col: func(series) + for col, series in compat.iteritems(self)} return self._constructor( data=new_data, index=self.index, columns=self.columns, @@ -537,12 +552,12 @@ def xs(self, key, axis=0, copy=False): # Arithmetic-related methods def _combine_frame(self, other, func, fill_value=None, level=None): - this, other = self.align(other, join='outer', level=level, copy=False) - new_index, new_columns = this.index, this.columns - if level is not None: raise NotImplementedError("'level' argument is not supported") + this, other = self.align(other, join='outer', level=level, copy=False) + new_index, new_columns = this.index, this.columns + if self.empty and other.empty: return self._constructor(index=new_index).__finalize__(self) @@ -562,17 +577,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None): if col in this and col in other: new_data[col] = func(this[col], other[col]) - # if the fill values are the same use them? 
or use a valid one - new_fill_value = None - other_fill_value = getattr(other, 'default_fill_value', np.nan) - if self.default_fill_value == other_fill_value: - new_fill_value = self.default_fill_value - elif np.isnan(self.default_fill_value) and not np.isnan( - other_fill_value): - new_fill_value = other_fill_value - elif not np.isnan(self.default_fill_value) and np.isnan( - other_fill_value): - new_fill_value = self.default_fill_value + new_fill_value = self._get_op_result_fill_value(other, func) return self._constructor(data=new_data, index=new_index, columns=new_columns, @@ -585,29 +590,19 @@ def _combine_match_index(self, other, func, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - new_index = self.index.union(other.index) - this = self - if self.index is not new_index: - this = self.reindex(new_index) - - if other.index is not new_index: - other = other.reindex(new_index) + this, other = self.align(other, join='outer', axis=0, level=level, + copy=False) for col, series in compat.iteritems(this): new_data[col] = func(series.values, other.values) - # fill_value is a function of our operator - if isna(other.fill_value) or isna(self.default_fill_value): - fill_value = np.nan - else: - fill_value = func(np.float64(self.default_fill_value), - np.float64(other.fill_value)) + fill_value = self._get_op_result_fill_value(other, func) return self._constructor( - new_data, index=new_index, columns=self.columns, + new_data, index=this.index, columns=self.columns, default_fill_value=fill_value).__finalize__(self) - def _combine_match_columns(self, other, func, level=None, try_cast=True): + def _combine_match_columns(self, other, func, level=None): # patched version of DataFrame._combine_match_columns to account for # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series, # where 3.0 is numpy.float64 and series is a SparseSeries. Still @@ -616,24 +611,56 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): if level is not None: raise NotImplementedError("'level' argument is not supported") - new_data = {} - - union = intersection = self.columns + left, right = self.align(other, join='outer', axis=1, level=level, + copy=False) + assert left.columns.equals(right.index) - if not union.equals(other.index): - union = other.index.union(self.columns) - intersection = other.index.intersection(self.columns) + new_data = {} - for col in intersection: - new_data[col] = func(self[col], float(other[col])) + for col in left.columns: + new_data[col] = func(left[col], float(right[col])) return self._constructor( - new_data, index=self.index, columns=union, + new_data, index=left.index, columns=left.columns, default_fill_value=self.default_fill_value).__finalize__(self) - def _combine_const(self, other, func, errors='raise', try_cast=True): + def _combine_const(self, other, func): return self._apply_columns(lambda x: func(x, other)) + def _get_op_result_fill_value(self, other, func): + own_default = self.default_fill_value + + if isinstance(other, DataFrame): + # i.e. called from _combine_frame + + other_default = getattr(other, 'default_fill_value', np.nan) + + # if the fill values are the same use them? or use a valid one + if own_default == other_default: + # TOOD: won't this evaluate as False if both are np.nan? 
+ fill_value = own_default + elif np.isnan(own_default) and not np.isnan(other_default): + fill_value = other_default + elif not np.isnan(own_default) and np.isnan(other_default): + fill_value = own_default + else: + fill_value = None + + elif isinstance(other, SparseSeries): + # i.e. called from _combine_match_index + + # fill_value is a function of our operator + if isna(other.fill_value) or isna(own_default): + fill_value = np.nan + else: + fill_value = func(np.float64(own_default), + np.float64(other.fill_value)) + + else: + raise NotImplementedError(type(other)) + + return fill_value + def _reindex_index(self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False): if level is not None: @@ -940,7 +967,7 @@ def stack_sparse_frame(frame): nobs = sum(lengths) # this is pretty fast - minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) + minor_codes = np.repeat(np.arange(len(frame.columns)), lengths) inds_to_concat = [] vals_to_concat = [] @@ -955,10 +982,10 @@ def stack_sparse_frame(frame): inds_to_concat.append(int_index.indices) vals_to_concat.append(series.sp_values) - major_labels = np.concatenate(inds_to_concat) + major_codes = np.concatenate(inds_to_concat) stacked_values = np.concatenate(vals_to_concat) index = MultiIndex(levels=[frame.index, frame.columns], - labels=[major_labels, minor_labels], + codes=[major_codes, minor_codes], verify_integrity=False) lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index 748a52f484893..2d0ce2d5e5951 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -3,10 +3,11 @@ Currently only includes SparseSeries.to_coo helpers. """ -from pandas.core.index import MultiIndex, Index -from pandas.core.series import Series from pandas.compat import OrderedDict, lmap +from pandas.core.index import Index, MultiIndex +from pandas.core.series import Series + def _check_is_partition(parts, whole): whole = set(whole) @@ -57,15 +58,7 @@ def _get_label_to_i_dict(labels, sort_labels=False): return (d) def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): - def robust_get_level_values(i): - # if index has labels (that are not None) use those, - # else use the level location - try: - return index.get_level_values(index.names[i]) - except KeyError: - return index.get_level_values(i) - - ilabels = list(zip(*[robust_get_level_values(i) for i in subset])) + ilabels = list(zip(*[index._get_level_values(i) for i in subset])) labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) labels_to_i = Series(labels_to_i) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 97cd3a0a1fb6a..4ea4531c53c72 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -5,32 +5,30 @@ # pylint: disable=E1101,E1103,W0231 -import numpy as np import warnings -from pandas.core.dtypes.missing import isna, notna +import numpy as np -from pandas.compat.numpy import function as nv -from pandas.core.index import Index, ensure_index, InvalidIndexError -from pandas.core.series import Series -from pandas.core.internals import SingleBlockManager -from pandas.core import generic -import pandas.core.common as com -import pandas.core.indexes.base as ibase -import pandas.core.ops as ops import pandas._libs.index as libindex +import pandas._libs.sparse as splib +from pandas._libs.sparse import BlockIndex, IntIndex +import pandas.compat as compat +from 
pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution -from pandas.core.sparse.array import ( - make_sparse, SparseArray, - _make_index) -from pandas._libs.sparse import BlockIndex, IntIndex -import pandas._libs.sparse as splib +from pandas.core.dtypes.common import is_scalar +from pandas.core.dtypes.generic import ABCSeries, ABCSparseSeries +from pandas.core.dtypes.missing import is_integer, isna, notna +from pandas.core import generic +from pandas.core.arrays import SparseArray +from pandas.core.arrays.sparse import SparseAccessor +from pandas.core.index import Index +from pandas.core.internals import SingleBlockManager +import pandas.core.ops as ops +from pandas.core.series import Series from pandas.core.sparse.scipy_sparse import ( - _sparse_series_to_coo, - _coo_to_sparse_series) - + _coo_to_sparse_series, _sparse_series_to_coo) _shared_doc_kwargs = dict(axes='index', klass='SparseSeries', axes_single_arg="{0, 'index'}", @@ -65,142 +63,114 @@ class SparseSeries(Series): def __init__(self, data=None, index=None, sparse_index=None, kind='block', fill_value=None, name=None, dtype=None, copy=False, fastpath=False): + # TODO: Most of this should be refactored and shared with Series + # 1. BlockManager -> array + # 2. Series.index, Series.name, index, name reconciliation + # 3. Implicit reindexing + # 4. Implicit broadcasting + # 5. Dict construction + if data is None: + data = [] + elif isinstance(data, SingleBlockManager): + index = data.index + data = data.blocks[0].values + elif isinstance(data, (ABCSeries, ABCSparseSeries)): + index = data.index if index is None else index + dtype = data.dtype if dtype is None else dtype + name = data.name if name is None else name + + if index is not None: + data = data.reindex(index) + + elif isinstance(data, compat.Mapping): + data, index = Series()._init_dict(data, index=index) + + elif is_scalar(data) and index is not None: + data = np.full(len(index), fill_value=data) + + super(SparseSeries, self).__init__( + SparseArray(data, + sparse_index=sparse_index, + kind=kind, + dtype=dtype, + fill_value=fill_value, + copy=copy), + index=index, name=name, + copy=False, fastpath=fastpath + ) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # avoid infinite recursion for other SparseSeries inputs + inputs = tuple( + x.values if isinstance(x, type(self)) else x + for x in inputs + ) + result = self.values.__array_ufunc__(ufunc, method, *inputs, **kwargs) + return self._constructor(result, index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False).__finalize__(self) - # we are called internally, so short-circuit - if fastpath: - - # data is an ndarray, index is defined - - if not isinstance(data, SingleBlockManager): - data = SingleBlockManager(data, index, fastpath=True) - if copy: - data = data.copy() - - else: - - if data is None: - data = [] - - if isinstance(data, Series) and name is None: - name = data.name - - if isinstance(data, SparseArray): - if index is not None: - assert (len(index) == len(data)) - sparse_index = data.sp_index - if fill_value is None: - fill_value = data.fill_value - - data = np.asarray(data) - - elif isinstance(data, SparseSeries): - if index is None: - index = data.index.view() - if fill_value is None: - fill_value = data.fill_value - # extract the SingleBlockManager - data = data._data - - elif isinstance(data, (Series, dict)): - data = Series(data, index=index) - index = data.index.view() - - res = make_sparse(data, kind=kind, 
fill_value=fill_value) - data, sparse_index, fill_value = res - - elif isinstance(data, (tuple, list, np.ndarray)): - # array-like - if sparse_index is None: - res = make_sparse(data, kind=kind, fill_value=fill_value) - data, sparse_index, fill_value = res - else: - assert (len(data) == sparse_index.npoints) - - elif isinstance(data, SingleBlockManager): - if dtype is not None: - data = data.astype(dtype) - if index is None: - index = data.index.view() - elif not data.index.equals(index) or copy: # pragma: no cover - # GH#19275 SingleBlockManager input should only be called - # internally - raise AssertionError('Cannot pass both SingleBlockManager ' - '`data` argument and a different ' - '`index` argument. `copy` must ' - 'be False.') - - else: - length = len(index) - - if data == fill_value or (isna(data) and isna(fill_value)): - if kind == 'block': - sparse_index = BlockIndex(length, [], []) - else: - sparse_index = IntIndex(length, []) - data = np.array([]) - - else: - if kind == 'block': - locs, lens = ([0], [length]) if length else ([], []) - sparse_index = BlockIndex(length, locs, lens) - else: - sparse_index = IntIndex(length, index) - v = data - data = np.empty(length) - data.fill(v) - - if index is None: - index = ibase.default_index(sparse_index.length) - index = ensure_index(index) - - # create/copy the manager - if isinstance(data, SingleBlockManager): - - if copy: - data = data.copy() - else: - - # create a sparse array - if not isinstance(data, SparseArray): - data = SparseArray(data, sparse_index=sparse_index, - fill_value=fill_value, dtype=dtype, - copy=copy) - - data = SingleBlockManager(data, index) + def __array_wrap__(self, result, context=None): + """ + Gets called prior to a ufunc (and after) - generic.NDFrame.__init__(self, data) + See SparseArray.__array_wrap__ for detail. + """ + result = self.values.__array_wrap__(result, context=context) + return self._constructor(result, index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False).__finalize__(self) - self.index = index - self.name = name + def __array_finalize__(self, obj): + """ + Gets called after any ufunc or other array operations, necessary + to pass on the index. 
+ """ + self.name = getattr(obj, 'name', None) + self.fill_value = getattr(obj, 'fill_value', None) - @property - def values(self): - """ return the array """ - return self.block.values + # unary ops + # TODO: See if this can be shared + def __pos__(self): + result = self.values.__pos__() + return self._constructor(result, index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False).__finalize__(self) - def __array__(self, result=None): - """ the array interface, return my values """ - return self.block.values + def __neg__(self): + result = self.values.__neg__() + return self._constructor(result, index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False).__finalize__(self) - def get_values(self): - """ same as values """ - return self.block.to_dense().view() + def __invert__(self): + result = self.values.__invert__() + return self._constructor(result, index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False).__finalize__(self) @property def block(self): + warnings.warn("SparseSeries.block is deprecated.", FutureWarning, + stacklevel=2) return self._data._block @property def fill_value(self): - return self.block.fill_value + return self.values.fill_value @fill_value.setter def fill_value(self, v): - self.block.fill_value = v + self.values.fill_value = v @property def sp_index(self): - return self.block.sp_index + return self.values.sp_index @property def sp_values(self): @@ -208,7 +178,7 @@ def sp_values(self): @property def npoints(self): - return self.sp_index.npoints + return self.values.npoints @classmethod def from_array(cls, arr, index=None, name=None, copy=False, @@ -250,13 +220,6 @@ def as_sparse_array(self, kind=None, fill_value=None, copy=False): return SparseArray(self.values, sparse_index=self.sp_index, fill_value=fill_value, kind=kind, copy=copy) - def __len__(self): - return len(self.block) - - @property - def shape(self): - return self._data.shape - def __unicode__(self): # currently, unicode is same as repr...fixes infinite loop series_rep = Series.__unicode__(self) @@ -264,33 +227,6 @@ def __unicode__(self): index=self.sp_index) return rep - def __array_wrap__(self, result, context=None): - """ - Gets called prior to a ufunc (and after) - - See SparseArray.__array_wrap__ for detail. - """ - if isinstance(context, tuple) and len(context) == 3: - ufunc, args, domain = context - args = [getattr(a, 'fill_value', a) for a in args] - with np.errstate(all='ignore'): - fill_value = ufunc(self.fill_value, *args[1:]) - else: - fill_value = self.fill_value - - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=fill_value, - copy=False).__finalize__(self) - - def __array_finalize__(self, obj): - """ - Gets called after any ufunc or other array operations, necessary - to pass on the index. 
- """ - self.name = getattr(obj, 'name', None) - self.fill_value = getattr(obj, 'fill_value', None) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): """ perform a reduction operation """ @@ -326,10 +262,6 @@ def _unpickle_series_compat(self, state): self._set_axis(0, index) self.name = name - def __iter__(self): - """ forward to the array """ - return iter(self.values) - def _set_subtyp(self, is_all_dates): if is_all_dates: object.__setattr__(self, '_subtyp', 'sparse_time_series') @@ -356,31 +288,15 @@ def _ixs(self, i, axis=0): def _get_val_at(self, loc): """ forward to the array """ - return self.block.values._get_val_at(loc) + return self.values._get_val_at(loc) def __getitem__(self, key): - try: - return self.index.get_value(self, key) - - except InvalidIndexError: - pass - except KeyError: - if isinstance(key, (int, np.integer)): - return self._get_val_at(key) - elif key is Ellipsis: - return self - raise Exception('Requested index not in this series!') - - except TypeError: - # Could not hash item, must be array-like? - pass - - key = com.values_from_object(key) - if self.index.nlevels > 1 and isinstance(key, tuple): - # to handle MultiIndex labels - key = self.index.get_loc(key) - return self._constructor(self.values[key], - index=self.index[key]).__finalize__(self) + # TODO: Document difference from Series.__getitem__, deprecate, + # and remove! + if is_integer(key) and key not in self.index: + return self._get_val_at(key) + else: + return super(SparseSeries, self).__getitem__(key) def _get_values(self, indexer): try: @@ -518,56 +434,39 @@ def _set_values(self, key, value): kind=self.kind) self._data = SingleBlockManager(values, self.index) - def to_dense(self, sparse_only=False): + def to_dense(self): """ Convert SparseSeries to a Series. - Parameters - ---------- - sparse_only : bool, default False - .. deprecated:: 0.20.0 - This argument will be removed in a future version. - - If True, return just the non-sparse values, or the dense version - of `self.values` if False. - Returns ------- s : Series """ - if sparse_only: - warnings.warn(("The 'sparse_only' parameter has been deprecated " - "and will be removed in a future version."), - FutureWarning, stacklevel=2) - int_index = self.sp_index.to_int_index() - index = self.index.take(int_index.indices) - return Series(self.sp_values, index=index, name=self.name) - else: - return Series(self.values.to_dense(), index=self.index, - name=self.name) + return Series(self.values.to_dense(), index=self.index, + name=self.name) @property def density(self): - r = float(self.sp_index.npoints) / float(self.sp_index.length) - return r + return self.values.density def copy(self, deep=True): """ Make a copy of the SparseSeries. Only the actual sparse values need to be copied """ - new_data = self._data - if deep: - new_data = self._data.copy() - + # TODO: https://github.com/pandas-dev/pandas/issues/22314 + # We skip the block manager till that is resolved. + new_data = self.values.copy(deep=deep) return self._constructor(new_data, sparse_index=self.sp_index, - fill_value=self.fill_value).__finalize__(self) + fill_value=self.fill_value, + index=self.index.copy(), + name=self.name).__finalize__(self) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) def reindex(self, index=None, method=None, copy=True, limit=None, **kwargs): - + # TODO: remove? 
return super(SparseSeries, self).reindex(index=index, method=method, copy=copy, limit=limit, **kwargs) @@ -585,28 +484,14 @@ def sparse_reindex(self, new_index): reindexed : SparseSeries """ if not isinstance(new_index, splib.SparseIndex): - raise TypeError('new index must be a SparseIndex') - - block = self.block.sparse_reindex(new_index) - new_data = SingleBlockManager(block, self.index) - return self._constructor(new_data, index=self.index, - sparse_index=new_index, - fill_value=self.fill_value).__finalize__(self) - - @Appender(generic.NDFrame.take.__doc__) - def take(self, indices, axis=0, convert=None, *args, **kwargs): - if convert is not None: - msg = ("The 'convert' parameter is deprecated " - "and will be removed in a future version.") - warnings.warn(msg, FutureWarning, stacklevel=2) - else: - convert = True - - nv.validate_take_with_convert(convert, args, kwargs) - new_values = SparseArray.take(self.values, indices) - new_index = self.index.take(indices) - return self._constructor(new_values, - index=new_index).__finalize__(self) + raise TypeError("new index must be a SparseIndex") + values = self.values + values = values.sp_index.to_int_index().reindex( + values.sp_values.astype('float64'), values.fill_value, new_index) + values = SparseArray(values, + sparse_index=new_index, + fill_value=self.values.fill_value) + return self._constructor(values, index=self.index).__finalize__(self) def cumsum(self, axis=0, *args, **kwargs): """ @@ -635,12 +520,14 @@ def cumsum(self, axis=0, *args, **kwargs): new_array, index=self.index, sparse_index=new_array.sp_index).__finalize__(self) + # TODO: SparseSeries.isna is Sparse, while Series.isna is dense @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) def isna(self): arr = SparseArray(isna(self.values.sp_values), sparse_index=self.values.sp_index, fill_value=isna(self.fill_value)) return self._constructor(arr, index=self.index).__finalize__(self) + isnull = isna @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) @@ -668,35 +555,6 @@ def dropna(self, axis=0, inplace=False, **kwargs): dense_valid = dense_valid[dense_valid != self.fill_value] return dense_valid.to_sparse(fill_value=self.fill_value) - @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) - def shift(self, periods, freq=None, axis=0): - if periods == 0: - return self.copy() - - # no special handling of fill values yet - if not isna(self.fill_value): - shifted = self.to_dense().shift(periods, freq=freq, - axis=axis) - return shifted.to_sparse(fill_value=self.fill_value, - kind=self.kind) - - if freq is not None: - return self._constructor( - self.sp_values, sparse_index=self.sp_index, - index=self.index.shift(periods, freq), - fill_value=self.fill_value).__finalize__(self) - - int_index = self.sp_index.to_int_index() - new_indices = int_index.indices + periods - start, end = new_indices.searchsorted([0, int_index.length]) - - new_indices = new_indices[start:end] - new_sp_index = _make_index(len(self), new_indices, self.sp_index) - - arr = self.values._simple_new(self.sp_values[start:end].copy(), - new_sp_index, fill_value=np.nan) - return self._constructor(arr, index=self.index).__finalize__(self) - def combine_first(self, other): """ Combine Series values, choosing the calling Series's values @@ -716,99 +574,16 @@ def combine_first(self, other): dense_combined = self.to_dense().combine_first(other) return dense_combined.to_sparse(fill_value=self.fill_value) + @Appender(SparseAccessor.to_coo.__doc__) def to_coo(self, row_levels=(0, ), column_levels=(1, 
), sort_labels=False): - """ - Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. - - Use row_levels and column_levels to determine the row and column - coordinates respectively. row_levels and column_levels are the names - (labels) or numbers of the levels. {row_levels, column_levels} must be - a partition of the MultiIndex level names (or numbers). - - Parameters - ---------- - row_levels : tuple/list - column_levels : tuple/list - sort_labels : bool, default False - Sort the row and column labels before forming the sparse matrix. - - Returns - ------- - y : scipy.sparse.coo_matrix - rows : list (row labels) - columns : list (column labels) - - Examples - -------- - >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) - >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) - >>> ss = s.to_sparse() - >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'], - column_levels=['C', 'D'], - sort_labels=True) - >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> - >>> A.todense() - matrix([[ 0., 0., 1., 3.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) - >>> rows - [(1, 1), (1, 2), (2, 1)] - >>> columns - [('a', 0), ('a', 1), ('b', 0), ('b', 1)] - """ A, rows, columns = _sparse_series_to_coo(self, row_levels, column_levels, sort_labels=sort_labels) return A, rows, columns @classmethod + @Appender(SparseAccessor.from_coo.__doc__) def from_coo(cls, A, dense_index=False): - """ - Create a SparseSeries from a scipy.sparse.coo_matrix. - - Parameters - ---------- - A : scipy.sparse.coo_matrix - dense_index : bool, default False - If False (default), the SparseSeries index consists of only the - coords of the non-null entries of the original coo_matrix. - If True, the SparseSeries index consists of the full sorted - (row, col) coordinates of the coo_matrix. 
- - Returns - ------- - s : SparseSeries - - Examples - --------- - >>> from scipy import sparse - >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), - shape=(3, 4)) - >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> - >>> A.todense() - matrix([[ 0., 0., 1., 2.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) - >>> ss = pd.SparseSeries.from_coo(A) - >>> ss - 0 2 1 - 3 2 - 1 0 3 - dtype: float64 - BlockIndex - Block locations: array([0], dtype=int32) - Block lengths: array([3], dtype=int32) - """ return _coo_to_sparse_series(A, dense_index=dense_index) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5a23951145cb4..d3d38d26ee86b 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,30 +1,26 @@ # -*- coding: utf-8 -*- +import codecs +import re +import textwrap +import warnings + import numpy as np +import pandas._libs.lib as lib +import pandas._libs.ops as libops +import pandas.compat as compat from pandas.compat import zip -from pandas.core.dtypes.generic import ABCSeries, ABCIndex -from pandas.core.dtypes.missing import isna, notna +from pandas.util._decorators import Appender, deprecate_kwarg + from pandas.core.dtypes.common import ( - is_bool_dtype, - is_categorical_dtype, - is_object_dtype, - is_string_like, - is_list_like, - is_scalar, - is_integer, - is_re) + ensure_object, is_bool_dtype, is_categorical_dtype, is_integer, + is_list_like, is_object_dtype, is_re, is_scalar, is_string_like) +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna -import pandas.core.common as com from pandas.core.algorithms import take_1d -import pandas.compat as compat from pandas.core.base import NoNewAttributesMixin -from pandas.util._decorators import Appender -import re -import pandas._libs.lib as lib -import pandas._libs.ops as libops -import warnings -import textwrap -import codecs +import pandas.core.common as com _cpython_optimized_encoders = ( "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" @@ -36,114 +32,26 @@ _shared_docs = dict() -def _get_array_list(arr, others): - """ - Auxiliary function for :func:`str_cat` - - Parameters - ---------- - arr : ndarray - The left-most ndarray of the concatenation - others : list, ndarray, Series - The rest of the content to concatenate. If list of list-likes, - all elements must be passable to ``np.asarray``. - - Returns - ------- - list - List of all necessary arrays - """ - from pandas.core.series import Series - - if len(others) and isinstance(com.values_from_object(others)[0], - (list, np.ndarray, Series)): - arrays = [arr] + list(others) - else: - arrays = [arr, others] - - return [np.asarray(x, dtype=object) for x in arrays] - - -def str_cat(arr, others=None, sep=None, na_rep=None): +def cat_core(list_of_columns, sep): """ Auxiliary function for :meth:`str.cat` - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not being passed then all values in the Series are - concatenated in a single string with a given `sep`. - Parameters ---------- - others : list-like, or list of list-likes, optional - List-likes (or a list of them) of the same length as calling object. - If None, returns str concatenating strings of the Series. - sep : string or None, default None - If None, concatenates without any separator. - na_rep : string or None, default None - If None, NA in the series are ignored. 
+ list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns Returns ------- - concat - ndarray containing concatenated results (if `others is not None`) - or str (if `others is None`) + nd.array + The concatenation of list_of_columns with sep """ - if sep is None: - sep = '' - - if others is not None: - arrays = _get_array_list(arr, others) - - n = _length_check(arrays) - masks = np.array([isna(x) for x in arrays]) - cats = None - - if na_rep is None: - na_mask = np.logical_or.reduce(masks, axis=0) - - result = np.empty(n, dtype=object) - np.putmask(result, na_mask, np.nan) - - notmask = ~na_mask - - tuples = zip(*[x[notmask] for x in arrays]) - cats = [sep.join(tup) for tup in tuples] - - result[notmask] = cats - else: - for i, x in enumerate(arrays): - x = np.where(masks[i], na_rep, x) - if cats is None: - cats = x - else: - cats = cats + sep + x - - result = cats - - return result - else: - arr = np.asarray(arr, dtype=object) - mask = isna(arr) - if na_rep is None and mask.any(): - if sep == '': - na_rep = '' - else: - return sep.join(arr[notna(arr)]) - return sep.join(np.where(mask, na_rep, arr)) - - -def _length_check(others): - n = None - for x in others: - try: - if n is None: - n = len(x) - elif len(x) != n: - raise ValueError('All arrays must be same length') - except TypeError: - raise ValueError('Must pass arrays containing strings to str_cat') - return n + list_with_sep = [sep] * (2 * len(list_of_columns) - 1) + list_with_sep[::2] = list_of_columns + return np.sum(list_with_sep, axis=0) def _na_map(f, arr, na_result=np.nan, dtype=object): @@ -292,7 +200,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): See Also -------- - match : analogous, but stricter, relying on re.match instead of re.search + match : Analogous, but stricter, relying on re.match instead of re.search. Series.str.startswith : Test if the start of each string element matches a pattern. Series.str.endswith : Same as startswith, but tests the end of string. @@ -562,7 +470,6 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): A copy of the object with all matching occurrences of `pat` replaced by `repl`. - Raises ------ ValueError @@ -636,7 +543,6 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): 1 bar 2 NaN dtype: object - """ # Check whether repl is valid (GH 13438, GH 15055) @@ -746,7 +652,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): If True, case sensitive flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE - na : default NaN, fill value for missing values. + na : default NaN, fill value for missing values Returns ------- @@ -757,7 +663,6 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): contains : analogous, but less strict, relying on re.search instead of re.match extract : extract matched groups - """ if not case: flags |= re.IGNORECASE @@ -890,7 +795,7 @@ def str_extract(arr, pat, flags=0, expand=True): See Also -------- - extractall : returns all matches (not just the first match) + extractall : Returns all matches (not just the first match). Examples -------- @@ -977,7 +882,7 @@ def str_extractall(arr, pat, flags=0): See Also -------- - extract : returns first match only (not all matches) + extract : Returns first match only (not all matches). 
Examples -------- @@ -1026,7 +931,7 @@ def str_extractall(arr, pat, flags=0): if regex.groups == 0: raise ValueError("pattern contains no capture groups") - if isinstance(arr, ABCIndex): + if isinstance(arr, ABCIndexClass): arr = arr.to_series().reset_index(drop=True) names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) @@ -1089,7 +994,7 @@ def str_get_dummies(arr, sep='|'): See Also -------- - pandas.get_dummies + get_dummies """ arr = arr.fillna('') try: @@ -1264,7 +1169,6 @@ def str_findall(arr, pat, flags=0): 1 [] 2 [b, b] dtype: object - """ regex = re.compile(pat, flags=flags) return _na_map(regex.findall, arr) @@ -1948,9 +1852,9 @@ def __iter__(self): g = self.get(i) def _wrap_result(self, result, use_codes=True, - name=None, expand=None): + name=None, expand=None, fill_value=np.nan): - from pandas.core.index import Index, MultiIndex + from pandas import Index, Series, MultiIndex # for category, we do the stuff on the categories, so blow it up # to the full series again @@ -1958,7 +1862,9 @@ def _wrap_result(self, result, use_codes=True, # so make it possible to skip this step as the method already did this # before the transformation... if use_codes and self._is_categorical: - result = take_1d(result, self._orig.cat.codes) + # if self._orig is a CategoricalIndex, there is no .cat-accessor + result = take_1d(result, Series(self._orig, copy=False).cat.codes, + fill_value=fill_value) if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result @@ -2083,12 +1989,12 @@ def _get_series_list(self, others, ignore_index=False): elif isinstance(others, np.ndarray) and others.ndim == 2: others = DataFrame(others, index=idx) return ([others[x] for x in others], False) - elif is_list_like(others): + elif is_list_like(others, allow_sets=False): others = list(others) # ensure iterators do not get read twice etc # in case of list-like `others`, all elements must be # either one-dimensional list-likes or scalars - if all(is_list_like(x) for x in others): + if all(is_list_like(x, allow_sets=False) for x in others): los = [] join_warn = False depr_warn = False @@ -2161,9 +2067,10 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): If others is None, the method returns the concatenation of all strings in the calling Series/Index. - sep : string or None, default None - If None, concatenates without any separator. - na_rep : string or None, default None + sep : str, default '' + The separator between the different elements/columns. By default + the empty string `''` is used. + na_rep : str or None, default None Representation that is inserted for all missing values: - If `na_rep` is None, and `others` is None, missing values in the @@ -2190,8 +2097,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): See Also -------- - split : Split each string in the Series/Index - join : Join lists contained as elements in the Series/Index + split : Split each string in the Series/Index. + join : Join lists contained as elements in the Series/Index. Examples -------- @@ -2242,13 +2149,6 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): `join`-keyword works as in other methods. 
>>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join=None, na_rep='-') - 0 ad - 1 ba - 2 -e - 3 dc - dtype: object - >>> >>> s.str.cat(t, join='left', na_rep='-') 0 aa 1 b- @@ -2283,6 +2183,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): if isinstance(others, compat.string_types): raise ValueError("Did you mean to supply a `sep` keyword?") + if sep is None: + sep = '' if isinstance(self._orig, Index): data = Series(self._orig, index=self._orig) @@ -2291,9 +2193,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # concatenate Series/Index with itself if no "others" if others is None: - result = str_cat(data, others=others, sep=sep, na_rep=na_rep) - return self._wrap_result(result, - use_codes=(not self._is_categorical)) + data = ensure_object(data) + na_mask = isna(data) + if na_rep is None and na_mask.any(): + data = data[~na_mask] + elif na_rep is not None and na_mask.any(): + data = np.where(na_mask, na_rep, data) + return sep.join(data) try: # turn anything in "others" into lists of Series @@ -2320,23 +2226,47 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "'outer'|'inner'|'right'`. The future default will " "be `join='left'`.", FutureWarning, stacklevel=2) + # if join is None, _get_series_list already force-aligned indexes + join = 'left' if join is None else join + # align if required - if join is not None: + if any(not data.index.equals(x.index) for x in others): # Need to add keys for uniqueness in case of duplicate columns others = concat(others, axis=1, join=(join if join == 'inner' else 'outer'), - keys=range(len(others))) + keys=range(len(others)), sort=False, copy=False) data, others = data.align(others, join=join) others = [others[x] for x in others] # again list of Series - # str_cat discards index - res = str_cat(data, others=others, sep=sep, na_rep=na_rep) + all_cols = [ensure_object(x) for x in [data] + others] + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) + + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_core([x[not_masked] for x in all_cols], + sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [np.where(nm, na_rep, col) + for nm, col in zip(na_masks, all_cols)] + result = cat_core(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_core(all_cols, sep) if isinstance(self._orig, Index): - res = Index(res, name=self._orig.name) + # add dtype for case that result is all-NA + result = Index(result, dtype=object, name=self._orig.name) else: # Series - res = Series(res, index=data.index, name=self._orig.name) - return res + result = Series(result, dtype=object, index=data.index, + name=self._orig.name) + return result _shared_docs['str_split'] = (""" Split strings around given separator/delimiter. @@ -2479,8 +2409,11 @@ def rsplit(self, pat=None, n=-1, expand=False): Parameters ---------- - pat : str, default whitespace + sep : str, default whitespace String to split on. + pat : str, default whitespace + .. deprecated:: 0.24.0 + Use ``sep`` instead expand : bool, default True If True, return DataFrame/MultiIndex expanding dimensionality. If False, return Series/Index. 
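For reference, a minimal usage sketch of the `Series.str.cat` behaviour the rewritten concatenation path above is meant to preserve. The values mirror the docstring examples shown in this hunk; the 0.24-era default of `sep=''` and index alignment via `join` are assumptions taken from the added code, not a definitive spec.

    import numpy as np
    import pandas as pd

    s = pd.Series(['a', 'b', np.nan, 'd'])
    t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])

    # sep now defaults to the empty string rather than None
    s.str.cat(['A', 'B', 'C', 'D'])              # 0 'aA', 1 'bB', 2 NaN, 3 'dD'

    # rows where any column is missing stay NaN unless na_rep is given
    s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')  # 0 'aA', 1 'bB', 2 '-C', 3 'dD'

    # a Series passed as `others` is aligned on the index when join is given
    s.str.cat(t, join='left', na_rep='-')        # 0 'aa', 1 'b-', 2 '-c', 3 'dd'
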
@@ -2498,7 +2431,6 @@ def rsplit(self, pat=None, n=-1, expand=False): Examples -------- - >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) >>> s 0 Linda van der Berg @@ -2555,8 +2487,9 @@ def rsplit(self, pat=None, n=-1, expand=False): 'empty strings', 'also': 'rpartition : Split the string at the last occurrence of `sep`' }) - def partition(self, pat=' ', expand=True): - f = lambda x: x.partition(pat) + @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + def partition(self, sep=' ', expand=True): + f = lambda x: x.partition(sep) result = _na_map(f, self._parent) return self._wrap_result(result, expand=expand) @@ -2566,8 +2499,9 @@ def partition(self, pat=' ', expand=True): 'string itself', 'also': 'partition : Split the string at the first occurrence of `sep`' }) - def rpartition(self, pat=' ', expand=True): - f = lambda x: x.rpartition(pat) + @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + def rpartition(self, sep=' ', expand=True): + f = lambda x: x.rpartition(sep) result = _na_map(f, self._parent) return self._wrap_result(result, expand=expand) @@ -2585,12 +2519,12 @@ def join(self, sep): def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) - return self._wrap_result(result) + return self._wrap_result(result, fill_value=na) @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result) + return self._wrap_result(result, fill_value=na) @copy(str_replace) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): @@ -2730,7 +2664,7 @@ def encode(self, encoding, errors="strict"): Parameters ---------- - to_strip : str or None, default None. + to_strip : str or None, default None Specifying the set of characters to be removed. All combinations of this set of characters will be stripped. If None then whitespaces are removed. @@ -2741,9 +2675,9 @@ def encode(self, encoding, errors="strict"): See Also -------- - Series.str.strip : Remove leading and trailing characters in Series/Index - Series.str.lstrip : Remove leading characters in Series/Index - Series.str.rstrip : Remove trailing characters in Series/Index + Series.str.strip : Remove leading and trailing characters in Series/Index. + Series.str.lstrip : Remove leading characters in Series/Index. + Series.str.rstrip : Remove trailing characters in Series/Index. Examples -------- @@ -2873,7 +2807,8 @@ def rfind(self, sub, start=0, end=None): return self._wrap_result(result) def normalize(self, form): - """Return the Unicode normal form for the strings in the Series/Index. + """ + Return the Unicode normal form for the strings in the Series/Index. For more information on the forms, see the :func:`unicodedata.normalize`. 
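This is the last hunk touching pandas/core/strings.py. Among the changes above, ``partition``/``rpartition`` now take ``sep`` instead of ``pat``, with the old keyword routed through ``deprecate_kwarg``. A short sketch of what callers see (illustrative only):

    import pandas as pd

    s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])

    # New spelling: split at the first occurrence of `sep`; expand=True
    # (the default) returns a three-column DataFrame (head, sep, tail).
    s.str.partition(sep=' ')

    # rpartition splits at the last occurrence instead.
    s.str.rpartition(sep=' ')

    # The old keyword still works but emits a FutureWarning via
    # @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep').
    s.str.partition(pat=' ')
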
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index eb8d2b0b6c809..86bb4e4b94382 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1,36 +1,24 @@ -from functools import partial from datetime import datetime, time -from collections import MutableMapping +from functools import partial import numpy as np from pandas._libs import tslib, tslibs -from pandas._libs.tslibs.strptime import array_strptime -from pandas._libs.tslibs import parsing, conversion, Timestamp +from pandas._libs.tslibs import Timestamp, conversion, parsing from pandas._libs.tslibs.parsing import ( # noqa - parse_time_string, - DateParseError, - _format_is_iso, - _guess_datetime_format) + DateParseError, _format_is_iso, _guess_datetime_format, parse_time_string) +from pandas._libs.tslibs.strptime import array_strptime +from pandas.compat import zip from pandas.core.dtypes.common import ( - ensure_object, - is_datetime64_ns_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_integer_dtype, - is_integer, - is_float, - is_list_like, - is_scalar, - is_numeric_dtype, - is_object_dtype) -from pandas.core.dtypes.generic import ( - ABCIndexClass, ABCSeries, - ABCDataFrame) + ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype, + is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype, + is_list_like, is_numeric_dtype, is_object_dtype, is_scalar) +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import notna + +from pandas import compat from pandas.core import algorithms -from pandas.compat import zip def _guess_datetime_format_for_array(arr, **kwargs): @@ -183,6 +171,9 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, - ndarray of Timestamps if box=False """ from pandas import DatetimeIndex + from pandas.core.arrays.datetimes import ( + maybe_convert_dtype, objects_to_datetime64ns) + if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') @@ -220,6 +211,11 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') + # warn if passing timedelta64, raise for PeriodDtype + # NB: this must come after unit transformation + orig_arg = arg + arg, _ = maybe_convert_dtype(arg, copy=False) + arg = ensure_object(arg) require_iso8601 = False @@ -236,14 +232,18 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, require_iso8601 = not infer_datetime_format format = None - try: - result = None + tz_parsed = None + result = None - if format is not None: + if format is not None: + try: # shortcut formatting here if format == '%Y%m%d': try: - result = _attempt_YYYYMMDD(arg, errors=errors) + # pass orig_arg as float-dtype may have been converted to + # datetime64[ns] + orig_arg = ensure_object(orig_arg) + result = _attempt_YYYYMMDD(orig_arg, errors=errors) except (ValueError, TypeError, tslibs.OutOfBoundsDatetime): raise ValueError("cannot convert the input to " "'%Y%m%d' date format") @@ -268,45 +268,45 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, if errors == 'raise': raise result = arg - - if result is None and (format is None or infer_datetime_format): - result, tz_parsed = tslib.array_to_datetime( - arg, - errors=errors, - utc=tz == 'utc', - dayfirst=dayfirst, - yearfirst=yearfirst, - require_iso8601=require_iso8601 - ) - if tz_parsed is not None: - if box: - # We can take a shortcut since the datetime64 numpy array - # is 
in UTC - return DatetimeIndex._simple_new(result, name=name, - tz=tz_parsed) - else: - # Convert the datetime64 numpy array to an numpy array - # of datetime objects - result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() - for ts in result] - return np.array(result, dtype=object) - + except ValueError as e: + # Fallback to try to convert datetime objects if timezone-aware + # datetime objects are found without passing `utc=True` + try: + values, tz = conversion.datetime_to_datetime64(arg) + return DatetimeIndex._simple_new(values, name=name, tz=tz) + except (ValueError, TypeError): + raise e + + if result is None: + assert format is None or infer_datetime_format + utc = tz == 'utc' + result, tz_parsed = objects_to_datetime64ns( + arg, dayfirst=dayfirst, yearfirst=yearfirst, + utc=utc, errors=errors, require_iso8601=require_iso8601, + allow_object=True) + + if tz_parsed is not None: if box: - # Ensure we return an Index in all cases where box=True - if is_datetime64_dtype(result): - return DatetimeIndex(result, tz=tz, name=name) - elif is_object_dtype(result): - # e.g. an Index of datetime objects - from pandas import Index - return Index(result, name=name) - return result + # We can take a shortcut since the datetime64 numpy array + # is in UTC + return DatetimeIndex._simple_new(result, name=name, + tz=tz_parsed) + else: + # Convert the datetime64 numpy array to an numpy array + # of datetime objects + result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() + for ts in result] + return np.array(result, dtype=object) - except ValueError as e: - try: - values, tz = conversion.datetime_to_datetime64(arg) - return DatetimeIndex._simple_new(values, name=name, tz=tz) - except (ValueError, TypeError): - raise e + if box: + # Ensure we return an Index in all cases where box=True + if is_datetime64_dtype(result): + return DatetimeIndex(result, tz=tz, name=name) + elif is_object_dtype(result): + # e.g. an Index of datetime objects + from pandas import Index + return Index(result, name=name) + return result def _adjust_to_origin(arg, origin, unit): @@ -543,7 +543,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, 1 1960-01-03 2 1960-01-04 - See also + See Also -------- pandas.DataFrame.astype : Cast argument to a specified dtype. pandas.to_timedelta : Convert argument to timedelta. @@ -570,7 +570,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, from pandas import Series values = convert_listlike(arg._values, True, format) result = Series(values, index=arg.index, name=arg.name) - elif isinstance(arg, (ABCDataFrame, MutableMapping)): + elif isinstance(arg, (ABCDataFrame, compat.MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, convert_listlike) @@ -702,9 +702,10 @@ def coerce(values): def _attempt_YYYYMMDD(arg, errors): - """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, - arg is a passed in as an object dtype, but could really be ints/strings - with nan-like/or floats (e.g. with nan) + """ + try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, + arg is a passed in as an object dtype, but could really be ints/strings + with nan-like/or floats (e.g. with nan) Parameters ---------- @@ -724,8 +725,9 @@ def calc_with_mask(carg, mask): result = np.empty(carg.shape, dtype='M8[ns]') iresult = result.view('i8') iresult[~mask] = tslibs.iNaT - result[mask] = calc(carg[mask].astype(np.float64).astype(np.int64)). 
\ - astype('M8[ns]') + + masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) + result[mask] = masked_result.astype('M8[ns]') return result # try intlike / strings that are ints diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 4bb5c223d1bcc..1d4973de92b99 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -1,16 +1,15 @@ import numpy as np -import pandas as pd -from pandas.core.dtypes.common import ( - is_scalar, - is_numeric_dtype, - is_decimal, - is_datetime_or_timedelta_dtype, - is_number, - ensure_object) -from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass -from pandas.core.dtypes.cast import maybe_downcast_to_dtype + from pandas._libs import lib +from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.common import ( + ensure_object, is_datetime_or_timedelta_dtype, is_decimal, is_number, + is_numeric_dtype, is_scalar) +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + +import pandas as pd + def to_numeric(arg, errors='raise', downcast=None): """ @@ -88,7 +87,7 @@ def to_numeric(arg, errors='raise', downcast=None): 3 -3.0 dtype: float64 - See also + See Also -------- pandas.DataFrame.astype : Cast argument to a specified dtype. pandas.to_datetime : Convert argument to datetime. diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 4dc4fcb00d84d..6bcf56c306e6a 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -3,40 +3,57 @@ """ import numpy as np -import pandas as pd + from pandas._libs import tslibs -from pandas._libs.tslibs.timedeltas import (convert_to_timedelta64, - array_to_timedelta64) +from pandas._libs.tslibs.timedeltas import ( + convert_to_timedelta64, parse_timedelta_unit) -from pandas.core.dtypes.common import ( - ensure_object, - is_integer_dtype, - is_timedelta64_dtype, - is_list_like) -from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + +import pandas as pd +from pandas.core.arrays.timedeltas import sequence_to_td64ns def to_timedelta(arg, unit='ns', box=True, errors='raise'): """ - Convert argument to timedelta + Convert argument to timedelta. + + Timedeltas are absolute differences in times, expressed in difference + units (e.g. days, hours, minutes, seconds). This method converts + an argument from a recognized timedelta format / value into + a Timedelta type. Parameters ---------- - arg : string, timedelta, list, tuple, 1-d array, or Series - unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, which is an - integer/float number - box : boolean, default True - - If True returns a Timedelta/TimedeltaIndex of the results - - if False returns a np.timedelta64 or ndarray of values of dtype - timedelta64[ns] + arg : str, timedelta, list-like or Series + The data to be converted to timedelta. + unit : str, default 'ns' + Denotes the unit of the arg. Possible values: + ('Y', 'M', 'W', 'D', 'days', 'day', 'hours', hour', 'hr', + 'h', 'm', 'minute', 'min', 'minutes', 'T', 'S', 'seconds', + 'sec', 'second', 'ms', 'milliseconds', 'millisecond', + 'milli', 'millis', 'L', 'us', 'microseconds', 'microsecond', + 'micro', 'micros', 'U', 'ns', 'nanoseconds', 'nano', 'nanos', + 'nanosecond', 'N'). + box : bool, default True + - If True returns a Timedelta/TimedeltaIndex of the results. 
+ - If False returns a numpy.timedelta64 or numpy.darray of + values of dtype timedelta64[ns]. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception - - If 'coerce', then invalid parsing will be set as NaT - - If 'ignore', then invalid parsing will return the input + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. Returns ------- - ret : timedelta64/arrays of timedelta64 if parsing succeeded + timedelta64 or numpy.array of timedelta64 + Output type returned if parsing succeeded. + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. Examples -------- @@ -64,12 +81,12 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'): TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) - See also - -------- - pandas.DataFrame.astype : Cast argument to a specified dtype. - pandas.to_datetime : Convert argument to datetime. + Returning an ndarray by using the 'box' keyword argument: + + >>> pd.to_timedelta(np.arange(5), box=False) + array([0, 1, 2, 3, 4], dtype='timedelta64[ns]') """ - unit = _validate_timedelta_unit(unit) + unit = parse_timedelta_unit(unit) if errors not in ('ignore', 'raise', 'coerce'): raise ValueError("errors must be one of 'ignore', " @@ -99,45 +116,6 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'): box=box, errors=errors) -_unit_map = { - 'Y': 'Y', - 'y': 'Y', - 'W': 'W', - 'w': 'W', - 'D': 'D', - 'd': 'D', - 'days': 'D', - 'Days': 'D', - 'day': 'D', - 'Day': 'D', - 'M': 'M', - 'H': 'h', - 'h': 'h', - 'm': 'm', - 'T': 'm', - 'S': 's', - 's': 's', - 'L': 'ms', - 'MS': 'ms', - 'ms': 'ms', - 'US': 'us', - 'us': 'us', - 'NS': 'ns', - 'ns': 'ns', -} - - -def _validate_timedelta_unit(arg): - """ provide validation / translation for timedelta short units """ - try: - return _unit_map[arg] - except (KeyError, TypeError): - if arg is None: - return 'ns' - raise ValueError("invalid timedelta unit {arg} provided" - .format(arg=arg)) - - def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): """Convert string 'r' to a timedelta object.""" @@ -161,31 +139,27 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): - arg = np.array(list(arg), dtype='O') - - # these are shortcut-able - if is_timedelta64_dtype(arg): - value = arg.astype('timedelta64[ns]') - elif is_integer_dtype(arg): - value = arg.astype('timedelta64[{unit}]'.format(unit=unit)).astype( - 'timedelta64[ns]', copy=False) - else: - try: - value = array_to_timedelta64(ensure_object(arg), - unit=unit, errors=errors) - value = value.astype('timedelta64[ns]', copy=False) - except ValueError: - if errors == 'ignore': - return arg - else: - # This else-block accounts for the cases when errors='raise' - # and errors='coerce'. If errors == 'raise', these errors - # should be raised. If errors == 'coerce', we shouldn't - # expect any errors to be raised, since all parsing errors - # cause coercion to pd.NaT. However, if an error / bug is - # introduced that causes an Exception to be raised, we would - # like to surface it. 
- raise + # This is needed only to ensure that in the case where we end up + # returning arg (errors == "ignore"), and where the input is a + # generator, we return a useful list-like instead of a + # used-up generator + arg = np.array(list(arg), dtype=object) + + try: + value = sequence_to_td64ns(arg, unit=unit, + errors=errors, copy=False)[0] + except ValueError: + if errors == 'ignore': + return arg + else: + # This else-block accounts for the cases when errors='raise' + # and errors='coerce'. If errors == 'raise', these errors + # should be raised. If errors == 'coerce', we shouldn't + # expect any errors to be raised, since all parsing errors + # cause coercion to pd.NaT. However, if an error / bug is + # introduced that causes an Exception to be raised, we would + # like to surface it. + raise if box: from pandas import TimedeltaIndex diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index e62d70847437c..29fc1e3671a83 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -4,17 +4,15 @@ import itertools import numpy as np + from pandas._libs import hashing, tslibs -from pandas.core.dtypes.generic import ( - ABCMultiIndex, - ABCIndexClass, - ABCSeries, - ABCDataFrame) + +from pandas.core.dtypes.cast import infer_dtype_from_scalar from pandas.core.dtypes.common import ( - is_categorical_dtype, is_list_like) + is_categorical_dtype, is_extension_array_dtype, is_list_like) +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries) from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.cast import infer_dtype_from_scalar - # 16 byte long hashing key _default_hash_key = '0123456789123456' @@ -71,7 +69,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, Returns ------- Series of uint64, same length as the object - """ from pandas import Series if hash_key is None: @@ -149,7 +146,7 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): vals = MultiIndex.from_tuples(vals) # create a list-of-Categoricals - vals = [Categorical(vals.labels[level], + vals = [Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) @@ -250,7 +247,6 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): Returns ------- 1d uint64 numpy array of hash values, same length as the vals - """ if not hasattr(vals, 'dtype'): @@ -265,10 +261,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) + elif is_extension_array_dtype(dtype): + vals, _ = vals._values_for_factorize() + dtype = vals.dtype # we'll be working with everything as 64-bit values, so handle this # 128-bit value early - elif np.issubdtype(dtype, np.complex128): + if np.issubdtype(dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can diff --git a/pandas/core/window.py b/pandas/core/window.py index ea0ec79d655fb..6c4dde54bd061 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1,59 +1,43 @@ """ - -provide a generic structure to support window functions, -similar to how we have a Groupby object - - +Provide a generic structure to support window functions, +similar to how we have a Groupby object. 
""" from __future__ import division -import warnings -import numpy as np from collections import defaultdict from datetime import timedelta +from textwrap import dedent +import warnings + +import numpy as np + +import pandas._libs.window as libwindow +import pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.core.dtypes.generic import ( - ABCSeries, - ABCDataFrame, - ABCDatetimeIndex, - ABCTimedeltaIndex, - ABCPeriodIndex, - ABCDateOffset) from pandas.core.dtypes.common import ( - is_integer, - is_bool, - is_float_dtype, - is_integer_dtype, - needs_i8_conversion, - is_timedelta64_dtype, - is_list_like, - ensure_float64, - is_scalar) + ensure_float64, is_bool, is_float_dtype, is_integer, is_integer_dtype, + is_list_like, is_scalar, is_timedelta64_dtype, needs_i8_conversion) +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCPeriodIndex, ABCSeries, + ABCTimedeltaIndex) from pandas.core.base import PandasObject, SelectionMixin -from pandas.core.groupby.base import GroupByMixin import pandas.core.common as com -import pandas._libs.window as _window - -from pandas import compat -from pandas.compat.numpy import function as nv -from pandas.util._decorators import (Substitution, Appender, - cache_readonly) from pandas.core.generic import _shared_docs -from textwrap import dedent - +from pandas.core.groupby.base import GroupByMixin _shared_docs = dict(**_shared_docs) _doc_template = """ + Returns + ------- + same type as input -Returns -------- -same type as input - -See also --------- -pandas.Series.%(name)s -pandas.DataFrame.%(name)s + See Also + -------- + Series.%(name)s + DataFrame.%(name)s """ @@ -98,23 +82,26 @@ def is_freq_type(self): def validate(self): if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") - if self.min_periods is not None and not \ - is_integer(self.min_periods): + if (self.min_periods is not None and + not is_integer(self.min_periods)): raise ValueError("min_periods must be an integer") - if self.closed is not None and self.closed not in \ - ['right', 'both', 'left', 'neither']: + if (self.closed is not None and + self.closed not in ['right', 'both', 'left', 'neither']): raise ValueError("closed must be 'right', 'left', 'both' or " "'neither'") def _convert_freq(self): - """ resample according to the how, return a new object """ - + """ + Resample according to the how, return a new object. + """ obj = self._selected_obj index = None return obj, index def _create_blocks(self): - """ split data into blocks & return conformed data """ + """ + Split data into blocks & return conformed data. + """ obj, index = self._convert_freq() if index is not None: @@ -131,12 +118,11 @@ def _create_blocks(self): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- - key : string / list of selections + key : str / list of selections ndim : 1,2 requested ndim of result subset : object, default None @@ -173,7 +159,9 @@ def _window_type(self): return self.__class__.__name__ def __unicode__(self): - """ provide a nice str repr of our rolling object """ + """ + Provide a nice str repr of our rolling object. 
+ """ attrs = ["{k}={v}".format(k=k, v=getattr(self, k)) for k in self._attributes @@ -187,7 +175,7 @@ def __iter__(self): def _get_index(self, index=None): """ - Return index as ndarrays + Return index as ndarrays. Returns ------- @@ -231,7 +219,9 @@ def _prep_values(self, values=None, kill_inf=True): return values def _wrap_result(self, result, block=None, obj=None): - """ wrap a single result """ + """ + Wrap a single result. + """ if obj is None: obj = self._selected_obj @@ -255,7 +245,7 @@ def _wrap_result(self, result, block=None, obj=None): def _wrap_results(self, results, blocks, obj): """ - wrap the results + Wrap the results. Parameters ---------- @@ -300,7 +290,9 @@ def _wrap_results(self, results, blocks, obj): return concat(final, axis=1).reindex(columns=columns, copy=False) def _center_window(self, result, window): - """ center the result in the window """ + """ + Center the result in the window. + """ if self.axis > result.ndim - 1: raise ValueError("Requested axis is larger then no. of argument " "dimensions") @@ -416,10 +408,10 @@ def aggregate(self, arg, *args, **kwargs): See Also -------- - Series.%(name)s : Calling object with Series data - DataFrame.%(name)s : Calling object with DataFrames - Series.mean : Equivalent method for Series - DataFrame.mean : Equivalent method for DataFrame + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.mean : Equivalent method for Series. + DataFrame.mean : Equivalent method for DataFrame. Examples -------- @@ -464,15 +456,16 @@ class Window(_Window): (otherwise result is NA). For a window that is specified by an offset, `min_periods` will default to 1. Otherwise, `min_periods` will default to the size of the window. - center : boolean, default False + center : bool, default False Set the labels at the center of the window. - win_type : string, default None + win_type : str, default None Provide a window type. If ``None``, all points are evenly weighted. See the notes below for further information. - on : string, optional + on : str, optional For a DataFrame, column on which to calculate the rolling window, rather than the index - closed : string, default None + axis : int or str, default 0 + closed : str, default None Make the interval closed on the 'right', 'left', 'both' or 'neither' endpoints. For offset-based windows, it defaults to 'right'. @@ -481,8 +474,6 @@ class Window(_Window): .. versionadded:: 0.20.0 - axis : int or string, default 0 - Returns ------- a Window or Rolling sub-classed for the particular operation @@ -548,7 +539,6 @@ class Window(_Window): 2013-01-01 09:00:05 NaN 2013-01-01 09:00:06 4.0 - Contrasting to an integer rolling window, this will roll a variable length window corresponding to the time period. The default for min_periods is 1. @@ -593,7 +583,7 @@ class Window(_Window): See Also -------- expanding : Provides expanding transformations. - ewm : Provides exponential weighted functions + ewm : Provides exponential weighted functions. """ def validate(self): @@ -620,8 +610,8 @@ def validate(self): def _prep_window(self, **kwargs): """ - provide validation for our window type, return the window - we have already been validated + Provide validation for our window type, return the window + we have already been validated. 
""" window = self._get_window() @@ -661,7 +651,7 @@ def _apply_window(self, mean=True, **kwargs): Parameters ---------- - mean : boolean, default True + mean : bool, default True If True computes weighted mean, else weighted sum Returns @@ -690,10 +680,10 @@ def _apply_window(self, mean=True, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, len(window)) - return _window.roll_window(np.concatenate((arg, - additional_nans)) - if center else arg, window, minp, - avg=mean) + return libwindow.roll_window(np.concatenate((arg, + additional_nans)) + if center else arg, window, minp, + avg=mean) result = np.apply_along_axis(f, self.axis, values) @@ -734,7 +724,7 @@ def f(arg, *args, **kwargs): 8 -0.096361 0.818139 0.472290 9 0.070889 0.134399 -0.031308 - See also + See Also -------- pandas.DataFrame.rolling.aggregate pandas.DataFrame.aggregate @@ -771,7 +761,9 @@ def mean(self, *args, **kwargs): class _GroupByMixin(GroupByMixin): - """ provide the groupby facilities """ + """ + Provide the groupby facilities. + """ def __init__(self, obj, *args, **kwargs): parent = kwargs.pop('parent', None) # noqa @@ -790,8 +782,8 @@ def __init__(self, obj, *args, **kwargs): def _apply(self, func, name, window=None, center=None, check_minp=None, **kwargs): """ - dispatch to apply; we are stripping all of the _apply kwargs and - performing the original function call on the grouped object + Dispatch to apply; we are stripping all of the _apply kwargs and + performing the original function call on the grouped object. """ def f(x, name=name, *args): @@ -814,16 +806,17 @@ def _constructor(self): def _apply(self, func, name=None, window=None, center=None, check_minp=None, **kwargs): """ - Rolling statistical measure using supplied function. Designed to be - used with passed-in Cython array-based functions. + Rolling statistical measure using supplied function. + + Designed to be used with passed-in Cython array-based functions. Parameters ---------- - func : string/callable to apply - name : string, optional + func : str/callable to apply + name : str, optional name of this function window : int/array, default to _get_window() - center : boolean, default to self.center + center : bool, default to self.center check_minp : function, default to _use_window Returns @@ -850,10 +843,10 @@ def _apply(self, func, name=None, window=None, center=None, # if we have a string function name, wrap it if isinstance(func, compat.string_types): - cfunc = getattr(_window, func, None) + cfunc = getattr(libwindow, func, None) if cfunc is None: raise ValueError("we do not support this function " - "in _window.{0}".format(func)) + "in libwindow.{func}".format(func=func)) def func(arg, window, min_periods=None, closed=None): minp = check_minp(min_periods, window) @@ -904,9 +897,9 @@ class _Rolling_and_Expanding(_Rolling): See Also -------- - pandas.Series.%(name)s : Calling object with Series data - pandas.DataFrame.%(name)s : Calling object with DataFrames - pandas.DataFrame.count : Count of the full DataFrame + pandas.Series.%(name)s : Calling object with Series data. + pandas.DataFrame.%(name)s : Calling object with DataFrames. + pandas.DataFrame.count : Count of the full DataFrame. Examples -------- @@ -951,7 +944,7 @@ def count(self): return self._wrap_results(results, blocks, obj) _shared_docs['apply'] = dedent(r""" - %(name)s function apply + %(name)s function apply. 
Parameters ---------- @@ -997,7 +990,7 @@ def f(arg, window, min_periods, closed): minp = _use_window(min_periods, window) if not raw: arg = Series(arg, index=self.obj.index) - return _window.roll_generic( + return libwindow.roll_generic( arg, window, minp, indexi, closed, offset, func, raw, args, kwargs) @@ -1009,7 +1002,7 @@ def sum(self, *args, **kwargs): return self._apply('roll_sum', 'sum', **kwargs) _shared_docs['max'] = dedent(""" - %(name)s maximum + Calculate the %(name)s maximum. """) def max(self, *args, **kwargs): @@ -1032,10 +1025,10 @@ def max(self, *args, **kwargs): See Also -------- - Series.%(name)s : Calling object with a Series - DataFrame.%(name)s : Calling object with a DataFrame - Series.min : Similar method for Series - DataFrame.min : Similar method for DataFrame + Series.%(name)s : Calling object with a Series. + DataFrame.%(name)s : Calling object with a DataFrame. + Series.min : Similar method for Series. + DataFrame.min : Similar method for DataFrame. Examples -------- @@ -1075,10 +1068,10 @@ def mean(self, *args, **kwargs): See Also -------- - Series.%(name)s : Calling object with Series data - DataFrame.%(name)s : Calling object with DataFrames - Series.median : Equivalent method for Series - DataFrame.median : Equivalent method for DataFrame + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.median : Equivalent method for Series. + DataFrame.median : Equivalent method for DataFrame. Examples -------- @@ -1118,11 +1111,11 @@ def median(self, **kwargs): See Also -------- - Series.%(name)s : Calling object with Series data - DataFrame.%(name)s : Calling object with DataFrames - Series.std : Equivalent method for Series - DataFrame.std : Equivalent method for DataFrame - numpy.std : Equivalent method for Numpy array + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.std : Equivalent method for Series. + DataFrame.std : Equivalent method for DataFrame. + numpy.std : Equivalent method for Numpy array. Notes ----- @@ -1162,8 +1155,8 @@ def std(self, ddof=1, *args, **kwargs): def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt(_window.roll_var(arg, window, minp, indexi, - self.closed, ddof)) + return _zsqrt(libwindow.roll_var(arg, window, minp, indexi, + self.closed, ddof)) return self._apply(f, 'std', check_minp=_require_min_periods(1), ddof=ddof, **kwargs) @@ -1189,11 +1182,11 @@ def f(arg, *args, **kwargs): See Also -------- - Series.%(name)s : Calling object with Series data - DataFrame.%(name)s : Calling object with DataFrames - Series.var : Equivalent method for Series - DataFrame.var : Equivalent method for DataFrame - numpy.var : Equivalent method for Numpy array + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.var : Equivalent method for Series. + DataFrame.var : Equivalent method for DataFrame. + numpy.var : Equivalent method for Numpy array. Notes ----- @@ -1256,12 +1249,12 @@ def skew(self, **kwargs): See Also -------- - Series.%(name)s : Calling object with Series data - DataFrame.%(name)s : Calling object with DataFrames - Series.kurt : Equivalent method for Series - DataFrame.kurt : Equivalent method for DataFrame - scipy.stats.skew : Third moment of a probability density - scipy.stats.kurtosis : Reference SciPy method + Series.%(name)s : Calling object with Series data. 
+ DataFrame.%(name)s : Calling object with DataFrames. + Series.kurt : Equivalent method for Series. + DataFrame.kurt : Equivalent method for DataFrame. + scipy.stats.skew : Third moment of a probability density. + scipy.stats.kurtosis : Reference SciPy method. Notes ----- @@ -1273,7 +1266,7 @@ def kurt(self, **kwargs): check_minp=_require_min_periods(4), **kwargs) _shared_docs['quantile'] = dedent(""" - %(name)s quantile. + Calculate the %(name)s quantile. Parameters ---------- @@ -1333,36 +1326,38 @@ def quantile(self, quantile, interpolation='linear', **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) if quantile == 1.0: - return _window.roll_max(arg, window, minp, indexi, - self.closed) + return libwindow.roll_max(arg, window, minp, indexi, + self.closed) elif quantile == 0.0: - return _window.roll_min(arg, window, minp, indexi, - self.closed) + return libwindow.roll_min(arg, window, minp, indexi, + self.closed) else: - return _window.roll_quantile(arg, window, minp, indexi, - self.closed, quantile, - interpolation) + return libwindow.roll_quantile(arg, window, minp, indexi, + self.closed, quantile, + interpolation) return self._apply(f, 'quantile', quantile=quantile, **kwargs) - _shared_docs['cov'] = dedent(""" - %(name)s sample covariance + _shared_docs['cov'] = """ + Calculate the %(name)s sample covariance. - Parameters - ---------- - other : Series, DataFrame, or ndarray, optional - if not supplied then will default to self and produce pairwise output - pairwise : bool, default None - If False then only matching columns between self and other will be used - and the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the - output will be a MultiIndexed DataFrame in the case of DataFrame - inputs. In the case of missing elements, only complete pairwise - observations will be used. - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements.""") + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): if other is None: @@ -1415,12 +1410,12 @@ def _get_cov(X, Y): See Also -------- - Series.%(name)s : Calling object with Series data - DataFrame.%(name)s : Calling object with DataFrames - Series.corr : Equivalent method for Series - DataFrame.corr : Equivalent method for DataFrame - %(name)s.cov : Similar method to calculate covariance - numpy.corrcoef : NumPy Pearson's correlation calculation + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.corr : Equivalent method for Series. + DataFrame.corr : Equivalent method for DataFrame. + %(name)s.cov : Similar method to calculate covariance. + numpy.corrcoef : NumPy Pearson's correlation calculation. 
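As a rough illustration of the rolling ``cov``/``corr`` signatures documented above (the data below is made up for the example and is not part of the patch):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(8, dtype=float))
    other = pd.Series(np.arange(8, dtype=float)[::-1])

    # With an explicit `other`, matching labels are compared; with no
    # `other`, the calling object is paired with itself (pairwise output
    # in the DataFrame case).
    s.rolling(window=3).cov(other, ddof=1)
    s.rolling(window=3).corr(other)
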
Notes ----- @@ -1499,7 +1494,7 @@ def _get_cov(X, Y): Y 0.626300 1.000000 4 X 1.000000 0.555368 Y 0.555368 1.000000 -""") + """) def corr(self, other=None, pairwise=None, **kwargs): if other is None: @@ -1580,14 +1575,18 @@ def validate(self): "and offset based windows") def _validate_monotonic(self): - """ validate on is monotonic """ + """ + Validate on is_monotonic. + """ if not self._on.is_monotonic: formatted = self.on or 'index' raise ValueError("{0} must be " "monotonic".format(formatted)) def _validate_freq(self): - """ validate & return window frequency """ + """ + Validate & return window frequency. + """ from pandas.tseries.frequencies import to_offset try: return to_offset(self.window) @@ -1627,7 +1626,6 @@ def _validate_freq(self): 8 -0.289082 2.454418 1.416871 9 0.212668 0.403198 -0.093924 - >>> df.rolling(3).agg({'A':'sum', 'B':'min'}) A B 0 NaN NaN @@ -1641,11 +1639,10 @@ def _validate_freq(self): 8 -0.289082 -1.647453 9 0.212668 -1.647453 - See also + See Also -------- pandas.Series.rolling pandas.DataFrame.rolling - """) @Appender(_agg_doc) @@ -1776,7 +1773,7 @@ def corr(self, other=None, pairwise=None, **kwargs): class RollingGroupby(_GroupByMixin, Rolling): """ - Provides a rolling groupby implementation + Provides a rolling groupby implementation. .. versionadded:: 0.18.1 @@ -1797,10 +1794,10 @@ def _gotitem(self, key, ndim, subset=None): def _validate_monotonic(self): """ - validate that on is monotonic; + Validate that on is monotonic; we don't care for groupby.rolling because we have already validated at a higher - level + level. """ pass @@ -1816,9 +1813,9 @@ class Expanding(_Rolling_and_Expanding): min_periods : int, default 1 Minimum number of observations in window required to have a value (otherwise result is NA). - center : boolean, default False + center : bool, default False Set the labels at the center of the window. - axis : int or string, default 0 + axis : int or str, default 0 Returns ------- @@ -1850,8 +1847,8 @@ class Expanding(_Rolling_and_Expanding): See Also -------- - rolling : Provides rolling window calculations - ewm : Provides exponential weighted functions + rolling : Provides rolling window calculations. + ewm : Provides exponential weighted functions. """ _attributes = ['min_periods', 'center', 'axis'] @@ -1866,12 +1863,25 @@ def _constructor(self): return Expanding def _get_window(self, other=None): - obj = self._selected_obj - if other is None: - return (max(len(obj), self.min_periods) if self.min_periods - else len(obj)) - return (max((len(obj) + len(obj)), self.min_periods) - if self.min_periods else (len(obj) + len(obj))) + """ + Get the window length over which to perform some operation. + + Parameters + ---------- + other : object, default None + The other object that is involved in the operation. + Such an object is involved for operations like covariance. + + Returns + ------- + window : int + The window length. 
+ """ + axis = self.obj._get_axis(self.axis) + length = len(axis) + (other is not None) * len(axis) + + other = self.min_periods or -1 + return max(length, other) _agg_doc = dedent(""" Examples @@ -1904,12 +1914,11 @@ def _get_window(self, other=None): 8 0.067236 0.948257 0.163353 9 -0.286980 0.618493 -0.694496 - See also + See Also -------- pandas.DataFrame.expanding.aggregate pandas.DataFrame.rolling.aggregate pandas.DataFrame.aggregate - """) @Appender(_agg_doc) @@ -2035,7 +2044,7 @@ def corr(self, other=None, pairwise=None, **kwargs): class ExpandingGroupby(_GroupByMixin, Expanding): """ - Provides a expanding groupby implementation + Provides a expanding groupby implementation. .. versionadded:: 0.18.1 @@ -2046,34 +2055,33 @@ def _constructor(self): _bias_template = """ - -Parameters ----------- -bias : boolean, default False - Use a standard estimation bias correction + Parameters + ---------- + bias : bool, default False + Use a standard estimation bias correction """ _pairwise_template = """ - -Parameters ----------- -other : Series, DataFrame, or ndarray, optional - if not supplied then will default to self and produce pairwise output -pairwise : bool, default None - If False then only matching columns between self and other will be used and - the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the output - will be a MultiIndex DataFrame in the case of DataFrame inputs. - In the case of missing elements, only complete pairwise observations will - be used. -bias : boolean, default False - Use a standard estimation bias correction + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + bias : bool, default False + Use a standard estimation bias correction """ class EWM(_Rolling): r""" - Provides exponential weighted functions + Provides exponential weighted functions. .. versionadded:: 0.18.0 @@ -2097,10 +2105,10 @@ class EWM(_Rolling): min_periods : int, default 0 Minimum number of observations in window required to have a value (otherwise result is NA). - adjust : boolean, default True + adjust : bool, default True Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings (viewing EWMA as a moving average) - ignore_na : boolean, default False + ignore_na : bool, default False Ignore missing values when calculating weights; specify True to reproduce pre-0.15.0 behavior @@ -2156,7 +2164,7 @@ class EWM(_Rolling): See Also -------- - rolling : Provides rolling window calculations + rolling : Provides rolling window calculations. expanding : Provides expanding transformations. """ _attributes = ['com', 'min_periods', 'adjust', 'ignore_na', 'axis'] @@ -2207,10 +2215,9 @@ def _constructor(self): 8 0.067236 0.948257 0.163353 9 -0.286980 0.618493 -0.694496 - See also + See Also -------- pandas.DataFrame.rolling.aggregate - """) @Appender(_agg_doc) @@ -2224,17 +2231,17 @@ def aggregate(self, arg, *args, **kwargs): agg = aggregate def _apply(self, func, **kwargs): - """Rolling statistical measure using supplied function. 
Designed to be + """ + Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. Parameters ---------- - func : string/callable to apply + func : str/callable to apply Returns ------- y : same type as input argument - """ blocks, obj, index = self._create_blocks() results = [] @@ -2251,10 +2258,10 @@ def _apply(self, func, **kwargs): # if we have a string function name, wrap it if isinstance(func, compat.string_types): - cfunc = getattr(_window, func, None) + cfunc = getattr(libwindow, func, None) if cfunc is None: raise ValueError("we do not support this function " - "in _window.{0}".format(func)) + "in libwindow.{func}".format(func=func)) def func(arg): return cfunc(arg, self.com, int(self.adjust), @@ -2267,7 +2274,9 @@ def func(arg): @Substitution(name='ewm') @Appender(_doc_template) def mean(self, *args, **kwargs): - """exponential weighted moving average""" + """ + Exponential weighted moving average. + """ nv.validate_window_func('mean', args, kwargs) return self._apply('ewma', **kwargs) @@ -2275,7 +2284,9 @@ def mean(self, *args, **kwargs): @Appender(_doc_template) @Appender(_bias_template) def std(self, bias=False, *args, **kwargs): - """exponential weighted moving stddev""" + """ + Exponential weighted moving stddev. + """ nv.validate_window_func('std', args, kwargs) return _zsqrt(self.var(bias=bias, **kwargs)) @@ -2285,13 +2296,15 @@ def std(self, bias=False, *args, **kwargs): @Appender(_doc_template) @Appender(_bias_template) def var(self, bias=False, *args, **kwargs): - """exponential weighted moving variance""" + """ + Exponential weighted moving variance. + """ nv.validate_window_func('var', args, kwargs) def f(arg): - return _window.ewmcov(arg, arg, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - int(bias)) + return libwindow.ewmcov(arg, arg, self.com, int(self.adjust), + int(self.ignore_na), int(self.min_periods), + int(bias)) return self._apply(f, **kwargs) @@ -2299,7 +2312,9 @@ def f(arg): @Appender(_doc_template) @Appender(_pairwise_template) def cov(self, other=None, pairwise=None, bias=False, **kwargs): - """exponential weighted sample covariance""" + """ + Exponential weighted sample covariance. + """ if other is None: other = self._selected_obj # only default unset @@ -2309,9 +2324,10 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = _window.ewmcov(X._prep_values(), Y._prep_values(), self.com, - int(self.adjust), int(self.ignore_na), - int(self.min_periods), int(bias)) + cov = libwindow.ewmcov(X._prep_values(), Y._prep_values(), + self.com, int(self.adjust), + int(self.ignore_na), int(self.min_periods), + int(bias)) return X._wrap_result(cov) return _flex_binary_moment(self._selected_obj, other._selected_obj, @@ -2321,7 +2337,9 @@ def _get_cov(X, Y): @Appender(_doc_template) @Appender(_pairwise_template) def corr(self, other=None, pairwise=None, **kwargs): - """exponential weighted sample correlation""" + """ + Exponential weighted sample correlation. 
+ """ if other is None: other = self._selected_obj # only default unset @@ -2333,10 +2351,10 @@ def _get_corr(X, Y): Y = self._shallow_copy(Y) def _cov(x, y): - return _window.ewmcov(x, y, self.com, int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - 1) + return libwindow.ewmcov(x, y, self.com, int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + 1) x_values = X._prep_values() y_values = Y._prep_values() @@ -2444,7 +2462,7 @@ def dataframe_from_int_dict(data, frame_template): # empty result result = DataFrame( index=MultiIndex(levels=[arg1.index, arg2.columns], - labels=[[], []]), + codes=[[], []]), columns=arg2.columns, dtype='float64') @@ -2461,9 +2479,8 @@ def dataframe_from_int_dict(data, frame_template): else: raise ValueError("'pairwise' is not True/False") else: - results = {} - for i, col in enumerate(arg1.columns): - results[i] = f(*_prep_binary(arg1.iloc[:, i], arg2)) + results = {i: f(*_prep_binary(arg1.iloc[:, i], arg2)) + for i, col in enumerate(arg1.columns)} return dataframe_from_int_dict(results, arg1) else: diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 147c43b30d45f..eb6a4674a7497 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -26,7 +26,6 @@ class UnsortedIndexError(KeyError): and the index has not been lexsorted. Subclass of `KeyError`. .. versionadded:: 0.20.0 - """ @@ -133,7 +132,7 @@ class ParserWarning(Warning): >>> csv = u'''a;b;c ... 1;1,8 ... 1;2,1''' - >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]') + >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]') # doctest: +SKIP ... # ParserWarning: Falling back to the 'python' engine... Adding `engine='python'` to `pd.read_csv` removes the Warning: diff --git a/pandas/io/api.py b/pandas/io/api.py index f542a8176dce7..8c8d7cf73b37a 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -4,29 +4,17 @@ # flake8: noqa -from pandas.io.parsers import read_csv, read_table, read_fwf from pandas.io.clipboards import read_clipboard from pandas.io.excel import ExcelFile, ExcelWriter, read_excel -from pandas.io.pytables import HDFStore, get_store, read_hdf -from pandas.io.json import read_json -from pandas.io.html import read_html -from pandas.io.sql import read_sql, read_sql_table, read_sql_query -from pandas.io.sas import read_sas from pandas.io.feather_format import read_feather +from pandas.io.gbq import read_gbq +from pandas.io.html import read_html +from pandas.io.json import read_json +from pandas.io.packers import read_msgpack, to_msgpack from pandas.io.parquet import read_parquet -from pandas.io.stata import read_stata +from pandas.io.parsers import read_csv, read_fwf, read_table from pandas.io.pickle import read_pickle, to_pickle -from pandas.io.packers import read_msgpack, to_msgpack -from pandas.io.gbq import read_gbq - -# deprecation, xref #13790 -def Term(*args, **kwargs): - import warnings - - warnings.warn("pd.Term is deprecated as it is not " - "applicable to user code. 
Instead use in-line " - "string expressions in the where clause when " - "searching in HDFStore", - FutureWarning, stacklevel=2) - from pandas.io.pytables import Term - return Term(*args, **kwargs) +from pandas.io.pytables import HDFStore, read_hdf +from pandas.io.sas import read_sas +from pandas.io.sql import read_sql, read_sql_query, read_sql_table +from pandas.io.stata import read_stata diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py index 0793ca6877cdb..d6d0ba0a560bb 100644 --- a/pandas/io/clipboard/clipboards.py +++ b/pandas/io/clipboard/clipboards.py @@ -1,7 +1,9 @@ import subprocess -from .exceptions import PyperclipException + from pandas.compat import PY2, text_type +from .exceptions import PyperclipException + EXCEPT_MSG = """ Pyperclip could not find a copy/paste mechanism for your system. For more information, please visit https://pyperclip.readthedocs.org """ diff --git a/pandas/io/clipboard/windows.py b/pandas/io/clipboard/windows.py index 5fc23f7102f41..3d979a61b5f2d 100644 --- a/pandas/io/clipboard/windows.py +++ b/pandas/io/clipboard/windows.py @@ -1,10 +1,11 @@ """ This module implements clipboard handling on Windows using ctypes. """ -import time import contextlib import ctypes -from ctypes import c_size_t, sizeof, c_wchar_p, get_errno, c_wchar +from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof +import time + from .exceptions import PyperclipWindowsException diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 0d564069c681f..23a2b04214e4e 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -1,10 +1,12 @@ """ io on the clipboard """ import warnings -from pandas.compat import StringIO, PY2, PY3 +import pandas.compat as compat +from pandas.compat import PY2, PY3, StringIO from pandas.core.dtypes.generic import ABCDataFrame -from pandas import compat, get_option, option_context + +from pandas import get_option, option_context def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover @@ -14,7 +16,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover Parameters ---------- - sep : str, default '\s+'. + sep : str, default '\s+' A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. 
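For context on the ``sep`` parameter documented above: ``read_clipboard`` hands the clipboard text to ``read_csv``, so the separator semantics are exactly those of ``read_csv``. A small sketch using ``StringIO`` in place of the clipboard (illustrative only):

    import pandas as pd
    from pandas.compat import StringIO  # the same compat shim this module imports

    text = "a b\tc\n1 2\t3\n4 5\t6"
    # The default sep=r'\s+' splits columns on any run of whitespace,
    # whether spaces or tabs.
    df = pd.read_csv(StringIO(text), sep=r'\s+')
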
@@ -42,7 +44,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover text, encoding=(kwargs.get('encoding') or get_option('display.encoding')) ) - except: + except AttributeError: pass # Excel copies into clipboard with \t separation diff --git a/pandas/io/common.py b/pandas/io/common.py index 405911eda7e9e..3a67238a66450 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,21 +1,21 @@ """Common IO api utilities""" -import os -import csv import codecs +from contextlib import closing, contextmanager +import csv import mmap -from contextlib import contextmanager, closing +import os import zipfile -from pandas.compat import StringIO, BytesIO, string_types, text_type -from pandas import compat -from pandas.io.formats.printing import pprint_thing -import pandas.core.common as com -from pandas.core.dtypes.common import is_number, is_file_like +import pandas.compat as compat +from pandas.compat import BytesIO, StringIO, string_types, text_type +from pandas.errors import ( # noqa + AbstractMethodError, DtypeWarning, EmptyDataError, ParserError, + ParserWarning) + +from pandas.core.dtypes.common import is_file_like, is_number -# compat -from pandas.errors import (ParserError, DtypeWarning, # noqa - EmptyDataError, ParserWarning) +from pandas.io.formats.printing import pprint_thing # gh-12665: Alias for now and remove later. CParserError = ParserError @@ -66,7 +66,7 @@ def __iter__(self): return self def __next__(self): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) if not compat.PY3: @@ -417,21 +417,22 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, elif is_path: if compat.PY2: # Python 2 + mode = "wb" if mode == "w" else mode f = open(path_or_buf, mode) elif encoding: # Python 3 and encoding - f = open(path_or_buf, mode, encoding=encoding) + f = open(path_or_buf, mode, encoding=encoding, newline="") elif is_text: # Python 3 and no explicit encoding - f = open(path_or_buf, mode, errors='replace') + f = open(path_or_buf, mode, errors='replace', newline="") else: # Python 3 and binary mode f = open(path_or_buf, mode) handles.append(f) # in Python 3, convert BytesIO or fileobjects passed with an encoding - if compat.PY3 and is_text and\ - (compression or isinstance(f, need_text_wrapping)): + if (compat.PY3 and is_text and + (compression or isinstance(f, need_text_wrapping))): from io import TextIOWrapper f = TextIOWrapper(f, encoding=encoding) handles.append(f) diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 377373f8a0135..1a22ee7240d59 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -1,7 +1,8 @@ """This module is designed for community supported date conversion functions""" -from pandas.compat import range, map import numpy as np + from pandas._libs.tslibs import parsing +from pandas.compat import map, range def parse_date_time(date_col, time_col): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 00b4c704c681b..03d873467dc10 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -4,37 +4,34 @@ # --------------------------------------------------------------------- # ExcelFile class -from datetime import datetime, date, time, MINYEAR, timedelta - -import os import abc -import warnings -from textwrap import fill -from io import UnsupportedOperation +from datetime import date, datetime, time, timedelta from distutils.version import LooseVersion +from io import UnsupportedOperation +import os +from textwrap import fill +import warnings import numpy as np 
import pandas._libs.json as json -from pandas.util._decorators import Appender, deprecate_kwarg -from pandas.errors import EmptyDataError - import pandas.compat as compat -from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, - string_types, OrderedDict) +from pandas.compat import ( + OrderedDict, add_metaclass, lrange, map, range, string_types, u, zip) +from pandas.errors import EmptyDataError +from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( - is_integer, is_float, - is_bool, is_list_like) + is_bool, is_float, is_integer, is_list_like) from pandas.core import config from pandas.core.frame import DataFrame -from pandas.io.parsers import TextParser -from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, - get_filepath_or_buffer, _NA_VALUES, - _stringify_path) +from pandas.io.common import ( + _NA_VALUES, _is_url, _stringify_path, _urlopen, _validate_header_arg, + get_filepath_or_buffer) from pandas.io.formats.printing import pprint_thing +from pandas.io.parsers import TextParser __all__ = ["read_excel", "ExcelWriter", "ExcelFile"] @@ -95,13 +92,26 @@ .. deprecated:: 0.21.0 Pass in `usecols` instead. -usecols : int or list, default None - * If None then parse all columns, - * If int then indicates last column to be parsed - * If list of ints then indicates list of column numbers to be parsed - * If string then indicates comma separated list of Excel column letters and - column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of +usecols : int, str, list-like, or callable default None + * If None, then parse all columns, + * If int, then indicates last column to be parsed + + .. deprecated:: 0.24.0 + Pass in a list of ints instead from 0 to `usecols` inclusive. + + * If string, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of both sides. + * If list of ints, then indicates list of column numbers to be parsed. + * If list of strings, then indicates list of column names to be parsed. + + .. versionadded:: 0.24.0 + + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + .. versionadded:: 0.24.0 + squeeze : boolean, default False If the parsed data only contains one column then return a Series dtype : Type name or dict of column -> type, default None @@ -112,7 +122,7 @@ .. versionadded:: 0.20.0 -engine: string, default None +engine : string, default None If io is not a buffer or path, this must be set to identify io. Acceptable values are None or xlrd converters : dict, default None @@ -165,12 +175,16 @@ convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally +mangle_dupe_cols : boolean, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. Returns ------- parsed : DataFrame or Dict of DataFrames - DataFrame from the passed in Excel file. See notes in sheet_name - argument for more information on when a Dict of Dataframes is returned. + DataFrame from the passed in Excel file. See notes in sheet_name + argument for more information on when a dict of DataFrames is returned. 
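A brief sketch of the ``usecols`` variants documented above (``'example.xlsx'`` is a hypothetical file name used only for illustration):

    import pandas as pd

    pd.read_excel('example.xlsx', usecols='A:C,E')                # Excel letters / ranges
    pd.read_excel('example.xlsx', usecols=[0, 2, 3])              # 0-based column indices
    pd.read_excel('example.xlsx', usecols=['foo', 'bar'])         # column names (new in 0.24.0)
    pd.read_excel('example.xlsx', usecols=lambda c: c.isupper())  # callable filter (new in 0.24.0)
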
Examples -------- @@ -304,6 +318,7 @@ def read_excel(io, comment=None, skipfooter=0, convert_float=True, + mangle_dupe_cols=True, **kwds): # Can't use _deprecate_kwarg since sheetname=None has a special meaning @@ -339,6 +354,7 @@ def read_excel(io, comment=comment, skipfooter=skipfooter, convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, **kwds) @@ -352,22 +368,21 @@ class ExcelFile(object): io : string, path object (pathlib.Path or py._path.local.LocalPath), file-like object or xlrd workbook If a string or path object, expected to be a path to xls or xlsx file - engine: string, default None + engine : string, default None If io is not a buffer or path, this must be set to identify io. Acceptable values are None or xlrd """ def __init__(self, io, **kwds): - err_msg = "Install xlrd >= 0.9.0 for Excel support" + err_msg = "Install xlrd >= 1.0.0 for Excel support" try: import xlrd except ImportError: raise ImportError(err_msg) else: - ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) - if ver < (0, 9): # pragma: no cover + if xlrd.__VERSION__ < LooseVersion("1.0.0"): raise ImportError(err_msg + ". Current version " + xlrd.__VERSION__) @@ -431,6 +446,7 @@ def parse(self, comment=None, skipfooter=0, convert_float=True, + mangle_dupe_cols=True, **kwds): """ Parse specified sheet(s) into a DataFrame @@ -466,41 +482,9 @@ def parse(self, comment=comment, skipfooter=skipfooter, convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, **kwds) - def _should_parse(self, i, usecols): - - def _range2cols(areas): - """ - Convert comma separated list of column names and column ranges to a - list of 0-based column indexes. - - >>> _range2cols('A:E') - [0, 1, 2, 3, 4] - >>> _range2cols('A,C,Z:AB') - [0, 2, 25, 26, 27] - """ - def _excel2num(x): - "Convert Excel column name like 'AB' to 0-based column index" - return reduce(lambda s, a: s * 26 + ord(a) - ord('A') + 1, - x.upper().strip(), 0) - 1 - - cols = [] - for rng in areas.split(','): - if ':' in rng: - rng = rng.split(':') - cols += lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1) - else: - cols.append(_excel2num(rng)) - return cols - - if isinstance(usecols, int): - return i <= usecols - elif isinstance(usecols, compat.string_types): - return i in _range2cols(usecols) - else: - return i in usecols - def _parse_excel(self, sheet_name=0, header=0, @@ -521,6 +505,7 @@ def _parse_excel(self, comment=None, skipfooter=0, convert_float=True, + mangle_dupe_cols=True, **kwds): _validate_header_arg(header) @@ -529,11 +514,6 @@ def _parse_excel(self, raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") - if parse_dates is True and index_col is None: - warnings.warn("The 'parse_dates=True' keyword of read_excel was " - "provided without an 'index_col' keyword value.") - - import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) @@ -546,36 +526,23 @@ def _parse_cell(cell_contents, cell_typ): if cell_typ == XL_CELL_DATE: - if xlrd_0_9_3: - # Use the newer xlrd datetime handling. - try: - cell_contents = \ - xldate.xldate_as_datetime(cell_contents, - epoch1904) - except OverflowError: - return cell_contents - # Excel doesn't distinguish between dates and time, - # so we treat dates on the epoch as times only. - # Also, Excel supports 1900 and 1904 epochs. 
- year = (cell_contents.timetuple())[0:3] - if ((not epoch1904 and year == (1899, 12, 31)) or - (epoch1904 and year == (1904, 1, 1))): - cell_contents = time(cell_contents.hour, - cell_contents.minute, - cell_contents.second, - cell_contents.microsecond) - else: - # Use the xlrd <= 0.9.2 date handling. - try: - dt = xldate.xldate_as_tuple(cell_contents, epoch1904) - - except xldate.XLDateTooLarge: - return cell_contents - - if dt[0] < MINYEAR: - cell_contents = time(*dt[3:]) - else: - cell_contents = datetime(*dt) + # Use the newer xlrd datetime handling. + try: + cell_contents = xldate.xldate_as_datetime( + cell_contents, epoch1904) + except OverflowError: + return cell_contents + + # Excel doesn't distinguish between dates and time, + # so we treat dates on the epoch as times only. + # Also, Excel supports 1900 and 1904 epochs. + year = (cell_contents.timetuple())[0:3] + if ((not epoch1904 and year == (1899, 12, 31)) or + (epoch1904 and year == (1904, 1, 1))): + cell_contents = time(cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond) elif cell_typ == XL_CELL_ERROR: cell_contents = np.nan @@ -589,12 +556,6 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - # xlrd >= 0.9.3 can return datetime objects directly. - if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): - xlrd_0_9_3 = True - else: - xlrd_0_9_3 = False - ret_dict = False # Keep sheetname to maintain backwards compatibility. @@ -622,17 +583,12 @@ def _parse_cell(cell_contents, cell_typ): sheet = self.book.sheet_by_index(asheetname) data = [] - should_parse = {} + usecols = _maybe_convert_usecols(usecols) for i in range(sheet.nrows): - row = [] - for j, (value, typ) in enumerate(zip(sheet.row_values(i), - sheet.row_types(i))): - if usecols is not None and j not in should_parse: - should_parse[j] = self._should_parse(j, usecols) - - if usecols is None or should_parse[j]: - row.append(_parse_cell(value, typ)) + row = [_parse_cell(value, typ) + for value, typ in zip(sheet.row_values(i), + sheet.row_types(i))] data.append(row) if sheet.nrows == 0: @@ -644,42 +600,46 @@ def _parse_cell(cell_contents, cell_typ): # forward fill and pull out names for MultiIndex column header_names = None - if header is not None: - if is_list_like(header): - header_names = [] - control_row = [True] * len(data[0]) - for row in header: - if is_integer(skiprows): - row += skiprows - - data[row], control_row = _fill_mi_header( - data[row], control_row) - header_name, data[row] = _pop_header_name( - data[row], index_col) + if header is not None and is_list_like(header): + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + row += skiprows + + data[row], control_row = _fill_mi_header(data[row], + control_row) + + if index_col is not None: + header_name, _ = _pop_header_name(data[row], index_col) header_names.append(header_name) - else: - data[header] = _trim_excel_header(data[header]) if is_list_like(index_col): - # forward fill values for MultiIndex index + # Forward fill values for MultiIndex index. if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) - for col in index_col: - last = data[offset][col] - for row in range(offset + 1, len(data)): - if data[row][col] == '' or data[row][col] is None: - data[row][col] = last - else: - last = data[row][col] + # Check if we have an empty dataset + # before trying to collect data. 
+ if offset < len(data): + for col in index_col: + last = data[offset][col] + + for row in range(offset + 1, len(data)): + if data[row][col] == '' or data[row][col] is None: + data[row][col] = last + else: + last = data[row][col] has_index_names = is_list_like(header) and len(header) > 1 # GH 12292 : error when read one empty column from excel file try: parser = TextParser(data, + names=names, header=header, index_col=index_col, has_index_names=has_index_names, @@ -695,14 +655,20 @@ def _parse_cell(cell_contents, cell_typ): thousands=thousands, comment=comment, skipfooter=skipfooter, + usecols=usecols, + mangle_dupe_cols=mangle_dupe_cols, **kwds) output[asheetname] = parser.read(nrows=nrows) - if names is not None: - output[asheetname].columns = names + if not squeeze or isinstance(output[asheetname], DataFrame): - output[asheetname].columns = output[ - asheetname].columns.set_names(header_names) + if header_names: + output[asheetname].columns = output[ + asheetname].columns.set_names(header_names) + elif compat.PY2: + output[asheetname].columns = _maybe_convert_to_string( + output[asheetname].columns) + except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() @@ -728,6 +694,101 @@ def __exit__(self, exc_type, exc_value, traceback): self.close() +def _excel2num(x): + """ + Convert Excel column name like 'AB' to 0-based column index. + + Parameters + ---------- + x : str + The Excel column name to convert to a 0-based column index. + + Returns + ------- + num : int + The column index corresponding to the name. + + Raises + ------ + ValueError + Part of the Excel column name was invalid. + """ + index = 0 + + for c in x.upper().strip(): + cp = ord(c) + + if cp < ord("A") or cp > ord("Z"): + raise ValueError("Invalid column name: {x}".format(x=x)) + + index = index * 26 + cp - ord("A") + 1 + + return index - 1 + + +def _range2cols(areas): + """ + Convert comma separated list of column names and ranges to indices. + + Parameters + ---------- + areas : str + A string containing a sequence of column ranges (or areas). + + Returns + ------- + cols : list + A list of 0-based column indices. + + Examples + -------- + >>> _range2cols('A:E') + [0, 1, 2, 3, 4] + >>> _range2cols('A,C,Z:AB') + [0, 2, 25, 26, 27] + """ + cols = [] + + for rng in areas.split(","): + if ":" in rng: + rng = rng.split(":") + cols.extend(lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1)) + else: + cols.append(_excel2num(rng)) + + return cols + + +def _maybe_convert_usecols(usecols): + """ + Convert `usecols` into a compatible format for parsing in `parsers.py`. + + Parameters + ---------- + usecols : object + The use-columns object to potentially convert. + + Returns + ------- + converted : object + The compatible format of `usecols`. + """ + if usecols is None: + return usecols + + if is_integer(usecols): + warnings.warn(("Passing in an integer for `usecols` has been " + "deprecated. Please pass in a list of ints from " + "0 to `usecols` inclusive instead."), + FutureWarning, stacklevel=2) + return lrange(usecols + 1) + + if isinstance(usecols, compat.string_types): + return _range2cols(usecols) + + return usecols + + def _validate_freeze_panes(freeze_panes): if freeze_panes is not None: if ( @@ -752,6 +813,39 @@ def _trim_excel_header(row): return row +def _maybe_convert_to_string(row): + """ + Convert elements in a row to string from Unicode. + + This is purely a Python 2.x patch and is performed ONLY when all + elements of the row are string-like. 
+ + Parameters + ---------- + row : array-like + The row of data to convert. + + Returns + ------- + converted : array-like + """ + if compat.PY2: + converted = [] + + for i in range(len(row)): + if isinstance(row[i], compat.string_types): + try: + converted.append(str(row[i])) + except UnicodeEncodeError: + break + else: + break + else: + row = converted + + return row + + def _fill_mi_header(row, control_row): """Forward fills blank entries in row, but only inside the same parent index @@ -780,22 +874,36 @@ def _fill_mi_header(row, control_row): control_row[i] = False last = row[i] - return row, control_row + return _maybe_convert_to_string(row), control_row # fill blank if index_col not None def _pop_header_name(row, index_col): - """ (header, new_data) for header rows in MultiIndex parsing""" - none_fill = lambda x: None if x == '' else x + """ + Pop the header name for MultiIndex parsing. + + Parameters + ---------- + row : list + The data row to parse for the header name. + index_col : int, list + The index columns for our data. Assumed to be non-null. + + Returns + ------- + header_name : str + The extracted header name. + trimmed_row : list + The original data row with the header name removed. + """ + # Pop out header name and fill w/blank. + i = index_col if not is_list_like(index_col) else max(index_col) + + header_name = row[i] + header_name = None if header_name == "" else header_name - if index_col is None: - # no index col specified, trim data for inference path - return none_fill(row[0]), row[1:] - else: - # pop out header name and fill w/ blank - i = index_col if not is_list_like(index_col) else max(index_col) - return none_fill(row[i]), row[:i] + [''] + row[i + 1:] + return header_name, row[:i] + [''] + row[i + 1:] @add_metaclass(abc.ABCMeta) @@ -934,8 +1042,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, cell of formatted data to save to Excel sheet sheet_name : string, default None Name of Excel sheet, if None, then use self.cur_sheet - startrow: upper left cell row to dump data frame - startcol: upper left cell column to dump data frame + startrow : upper left cell row to dump data frame + startcol : upper left cell column to dump data frame freeze_panes: integer tuple of length 2 contains the bottom-most row and right-most column to freeze """ @@ -1082,7 +1190,7 @@ def _convert_to_style(cls, style_dict): converts a style_dict to an openpyxl style object Parameters ---------- - style_dict: style dictionary to convert + style_dict : style dictionary to convert """ from openpyxl.style import Style @@ -1610,8 +1718,8 @@ def _convert_to_style(cls, style_dict, num_format_str=None): converts a style_dict to an xlwt style object Parameters ---------- - style_dict: style dictionary to convert - num_format_str: optional number format string + style_dict : style dictionary to convert + num_format_str : optional number format string """ import xlwt @@ -1713,8 +1821,8 @@ def convert(cls, style_dict, num_format_str=None): Parameters ---------- - style_dict: style dictionary to convert - num_format_str: optional number format string + style_dict : style dictionary to convert + num_format_str : optional number format string """ # Create a XlsxWriter format object. 
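For context, the reworked ``usecols`` handling added in this file funnels every accepted form through ``_maybe_convert_usecols`` (using ``_excel2num`` / ``_range2cols``) before the data reaches ``TextParser``. A minimal usage sketch, not part of the patch; the file name ``report.xlsx`` and its column names are made up for illustration:

    import pandas as pd

    # Excel-style letter ranges are translated by _range2cols,
    # e.g. 'A:C,F' -> [0, 1, 2, 5], so these two calls select the same columns.
    df_letters = pd.read_excel('report.xlsx', usecols='A:C,F')
    df_indices = pd.read_excel('report.xlsx', usecols=[0, 1, 2, 5])

    # New in 0.24.0: lists of column names and callables are accepted as well.
    df_names = pd.read_excel('report.xlsx', usecols=['date', 'price'])
    df_filtered = pd.read_excel('report.xlsx',
                                usecols=lambda name: name.startswith('price'))

    # A bare integer still works but now raises a FutureWarning and is
    # expanded to lrange(usecols + 1), i.e. columns 0 through usecols inclusive.
    df_legacy = pd.read_excel('report.xlsx', usecols=2)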
@@ -1755,14 +1863,14 @@ def convert(cls, style_dict, num_format_str=None): props[k] = ['none', 'thin', 'medium', 'dashed', 'dotted', 'thick', 'double', 'hair', 'mediumDashed', 'dashDot', 'mediumDashDot', 'dashDotDot', - 'mediumDashDotDot', 'slantDashDot'].\ - index(props[k]) + 'mediumDashDotDot', + 'slantDashDot'].index(props[k]) except ValueError: props[k] = 2 if isinstance(props.get('font_script'), string_types): - props['font_script'] = ['baseline', 'superscript', 'subscript'].\ - index(props['font_script']) + props['font_script'] = ['baseline', 'superscript', + 'subscript'].index(props['font_script']) if isinstance(props.get('underline'), string_types): props['underline'] = {'none': 0, 'single': 1, 'double': 2, diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 1bc6526214a91..5c8ab37c7c917 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,37 +1,37 @@ """ feather-format compat """ from distutils.version import LooseVersion -from pandas import DataFrame, RangeIndex, Int64Index + from pandas.compat import range +from pandas.util._decorators import deprecate_kwarg + +from pandas import DataFrame, Int64Index, RangeIndex + from pandas.io.common import _stringify_path def _try_import(): - # since pandas is a dependency of feather + # since pandas is a dependency of pyarrow # we need to import on first use - try: - import feather + import pyarrow + from pyarrow import feather except ImportError: - # give a nice error message - raise ImportError("the feather-format library is not installed\n" + raise ImportError("pyarrow is not installed\n\n" "you can install via conda\n" - "conda install feather-format -c conda-forge\n" + "conda install pyarrow -c conda-forge\n" "or via pip\n" - "pip install -U feather-format\n") + "pip install -U pyarrow\n") - try: - LooseVersion(feather.__version__) >= LooseVersion('0.3.1') - except AttributeError: - raise ImportError("the feather-format library must be >= " - "version 0.3.1\n" + if LooseVersion(pyarrow.__version__) < LooseVersion('0.4.1'): + raise ImportError("pyarrow >= 0.4.1 required for feather support\n\n" "you can install via conda\n" - "conda install feather-format -c conda-forge" + "conda install pyarrow -c conda-forge" "or via pip\n" - "pip install -U feather-format\n") + "pip install -U pyarrow\n") - return feather + return feather, pyarrow def to_feather(df, path): @@ -48,7 +48,7 @@ def to_feather(df, path): if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") - feather = _try_import() + feather = _try_import()[0] valid_types = {'string', 'unicode'} # validate index @@ -80,10 +80,11 @@ def to_feather(df, path): if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_dataframe(df, path) + feather.write_feather(df, path) -def read_feather(path, nthreads=1): +@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads') +def read_feather(path, columns=None, use_threads=True): """ Load a feather-format object from the file path @@ -92,10 +93,19 @@ def read_feather(path, nthreads=1): Parameters ---------- path : string file path, or file-like object + columns : sequence, default None + If not provided, all columns are read + + .. versionadded 0.24.0 nthreads : int, default 1 Number of CPU threads to use when reading to pandas.DataFrame .. versionadded 0.21.0 + .. deprecated 0.24.0 + use_threads : bool, default True + Whether to parallelize reading using multiple threads + + .. 
versionadded 0.24.0 Returns ------- @@ -103,10 +113,15 @@ def read_feather(path, nthreads=1): """ - feather = _try_import() + feather, pyarrow = _try_import() path = _stringify_path(path) - if LooseVersion(feather.__version__) < LooseVersion('0.4.0'): - return feather.read_dataframe(path) + if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'): + int_use_threads = int(use_threads) + if int_use_threads < 1: + int_use_threads = 1 + return feather.read_feather(path, columns=columns, + nthreads=int_use_threads) - return feather.read_dataframe(path, nthreads=nthreads) + return feather.read_feather(path, columns=columns, + use_threads=bool(use_threads)) diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index b8b28a0b0c98c..64168dd7db1b8 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -2,8 +2,9 @@ Internal module for console introspection """ -import sys import locale +import sys + from pandas.io.formats.terminal import get_terminal_size # ----------------------------------------------------------------------------- @@ -100,7 +101,7 @@ def check_main(): try: return __IPYTHON__ or check_main() # noqa - except: + except NameError: return check_main() @@ -118,7 +119,7 @@ def in_qtconsole(): ip.config.get('IPKernelApp', {}).get('parent_appname', "")) if 'qtconsole' in front_end.lower(): return True - except: + except NameError: return False return False @@ -137,7 +138,7 @@ def in_ipnb(): ip.config.get('IPKernelApp', {}).get('parent_appname', "")) if 'notebook' in front_end.lower(): return True - except: + except NameError: return False return False @@ -149,7 +150,7 @@ def in_ipython_frontend(): try: ip = get_ipython() # noqa return 'zmq' in str(type(ip)).lower() - except: + except NameError: pass return False diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 9faac6cd09218..46c843af043e7 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,29 +5,24 @@ from __future__ import print_function -import warnings - import csv as csvlib +import os +import warnings from zipfile import ZipFile import numpy as np from pandas._libs import writers as libwriters - -from pandas import compat from pandas.compat import StringIO, range, zip -from pandas.core.dtypes.missing import notna from pandas.core.dtypes.generic import ( - ABCMultiIndex, ABCPeriodIndex, ABCDatetimeIndex, ABCIndexClass) + ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex) +from pandas.core.dtypes.missing import notna + +from pandas import compat from pandas.io.common import ( - _expand_user, - _get_handle, - _infer_compression, - _stringify_path, - UnicodeWriter, -) + UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer) class CSVFormatter(object): @@ -45,7 +40,9 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', if path_or_buf is None: path_or_buf = StringIO() - self.path_or_buf = _expand_user(_stringify_path(path_or_buf)) + self.path_or_buf, _, _, _ = get_filepath_or_buffer( + path_or_buf, encoding=encoding, compression=compression, mode=mode + ) self.sep = sep self.na_rep = na_rep self.float_format = float_format @@ -72,7 +69,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.doublequote = doublequote self.escapechar = escapechar - self.line_terminator = line_terminator + self.line_terminator = line_terminator or os.linesep self.date_format = date_format diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index d6fcfb2207cf9..d74722996a660 100644 --- 
a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -1,23 +1,24 @@ """Utilities for conversion to writer-agnostic Excel representation """ +import itertools import re import warnings -import itertools import numpy as np from pandas.compat import reduce -import pandas.core.common as com -from pandas.core.dtypes.common import is_float, is_scalar from pandas.core.dtypes import missing +from pandas.core.dtypes.common import is_float, is_scalar from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex + from pandas import Index +import pandas.core.common as com from pandas.io.formats.css import CSSResolver, CSSWarning -from pandas.io.formats.printing import pprint_thing from pandas.io.formats.format import get_level_lengths +from pandas.io.formats.printing import pprint_thing class ExcelCell(object): @@ -430,9 +431,9 @@ def _format_header_mi(self): name = columns.names[lnum] yield ExcelCell(lnum, coloffset, name, self.header_style) - for lnum, (spans, levels, labels) in enumerate(zip( - level_lengths, columns.levels, columns.labels)): - values = levels.take(labels) + for lnum, (spans, levels, level_codes) in enumerate(zip( + level_lengths, columns.levels, columns.codes)): + values = levels.take(level_codes) for i in spans: if spans[i] > 1: yield ExcelCell(lnum, coloffset + i + 1, values[i], @@ -573,11 +574,11 @@ def _format_hierarchical_rows(self): names=False) level_lengths = get_level_lengths(level_strs) - for spans, levels, labels in zip(level_lengths, - self.df.index.levels, - self.df.index.labels): + for spans, levels, level_codes in zip(level_lengths, + self.df.index.levels, + self.df.index.codes): - values = levels.take(labels, + values = levels.take(level_codes, allow_fill=levels._can_hold_na, fill_value=True) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index db86409adc2b0..8452eb562a8e6 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -5,45 +5,37 @@ """ from __future__ import print_function -# pylint: disable=W0141 from functools import partial import numpy as np from pandas._libs import lib -from pandas._libs.tslibs import NaT, iNaT, Timestamp, Timedelta from pandas._libs.tslib import format_array_from_datetime +from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT +from pandas.compat import StringIO, lzip, map, u, zip -from pandas import compat -from pandas.compat import StringIO, lzip, map, zip, u - -from pandas.core.dtypes.missing import isna, notna from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_float_dtype, - is_period_arraylike, - is_integer_dtype, - is_interval_dtype, - is_datetimetz, - is_integer, - is_float, - is_scalar, - is_numeric_dtype, - is_datetime64_dtype, - is_timedelta64_dtype, - is_list_like) -from pandas.core.dtypes.generic import ABCSparseArray, ABCMultiIndex + is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_extension_array_dtype, is_float, is_float_dtype, is_integer, + is_integer_dtype, is_list_like, is_numeric_dtype, is_scalar, + is_timedelta64_dtype) +from pandas.core.dtypes.generic import ( + ABCIndexClass, ABCMultiIndex, ABCSeries, ABCSparseArray) +from pandas.core.dtypes.missing import isna, notna + +from pandas import compat from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.index import Index, ensure_index from pandas.core.config import get_option, set_option +from pandas.core.index import Index, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex -from 
pandas.core.indexes.period import PeriodIndex -from pandas.io.formats.terminal import get_terminal_size from pandas.io.common import _expand_user, _stringify_path from pandas.io.formats.printing import adjoin, justify, pprint_thing +from pandas.io.formats.terminal import get_terminal_size + +# pylint: disable=W0141 common_docstring = """ @@ -96,6 +88,10 @@ Maximum number of columns to display in the console. show_dimensions : bool, default False Display DataFrame dimensions (number of rows by number of columns). + decimal : str, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + + .. versionadded:: 0.18.0 """ _VALID_JUSTIFY_PARAMETERS = ("left", "right", "center", "justify", @@ -109,8 +105,6 @@ String representation of the dataframe. """ -docstring_to_string = common_docstring + return_docstring - class CategoricalFormatter(object): @@ -616,11 +610,6 @@ def to_string(self): else: # max_cols == 0. Try to fit frame to terminal text = self.adj.adjoin(1, *strcols).split('\n') max_len = Series(text).str.len().max() - headers = [ele[0] for ele in strcols] - # Size of last col determines dot col size. See - # `self._to_str_columns - size_tr_col = len(headers[self.tr_size_col]) - max_len += size_tr_col # Need to make space for largest row # plus truncate dot col dif = max_len - self.w # '+ 1' to avoid too wide repr (GH PR #17023) @@ -741,12 +730,8 @@ def to_html(self, classes=None, notebook=False, border=None): .. versionadded:: 0.19.0 """ from pandas.io.formats.html import HTMLFormatter - html_renderer = HTMLFormatter(self, classes=classes, - max_rows=self.max_rows, - max_cols=self.max_cols, - notebook=notebook, - border=border, - table_id=self.table_id) + html_renderer = HTMLFormatter(self, classes=classes, notebook=notebook, + border=border, table_id=self.table_id) if hasattr(self.buf, 'write'): html_renderer.write_result(self.buf) elif isinstance(self.buf, compat.string_types): @@ -857,22 +842,18 @@ def _get_column_name_list(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.'): - if is_categorical_dtype(values): - fmt_klass = CategoricalArrayFormatter - elif is_interval_dtype(values): - fmt_klass = IntervalArrayFormatter + if is_datetime64_dtype(values.dtype): + fmt_klass = Datetime64Formatter + elif is_timedelta64_dtype(values.dtype): + fmt_klass = Timedelta64Formatter + elif is_extension_array_dtype(values.dtype): + fmt_klass = ExtensionArrayFormatter elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter - elif is_period_arraylike(values): - fmt_klass = PeriodArrayFormatter elif is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter - elif is_datetimetz(values): + elif is_datetime64tz_dtype(values): fmt_klass = Datetime64TZFormatter - elif is_datetime64_dtype(values.dtype): - fmt_klass = Datetime64Formatter - elif is_timedelta64_dtype(values.dtype): - fmt_klass = Timedelta64Formatter else: fmt_klass = GenericArrayFormatter @@ -971,6 +952,8 @@ def __init__(self, *args, **kwargs): # float_format is expected to be a string # formatter should be used to pass a function if self.float_format is not None and self.formatter is None: + # GH21625, GH22270 + self.fixed_width = False if callable(self.float_format): self.formatter = self.float_format self.float_format = None @@ -1134,39 +1117,22 @@ def _format_strings(self): return fmt_values.tolist() -class IntervalArrayFormatter(GenericArrayFormatter): - - def __init__(self, values, *args, **kwargs): - 
GenericArrayFormatter.__init__(self, values, *args, **kwargs) - - def _format_strings(self): - formatter = self.formatter or str - fmt_values = np.array([formatter(x) for x in self.values]) - return fmt_values - - -class PeriodArrayFormatter(IntArrayFormatter): - +class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self): - from pandas.core.indexes.period import IncompatibleFrequency - try: - values = PeriodIndex(self.values).to_native_types() - except IncompatibleFrequency: - # periods may contains different freq - values = Index(self.values, dtype='object').to_native_types() - - formatter = self.formatter or (lambda x: '{x}'.format(x=x)) - fmt_values = [formatter(x) for x in values] - return fmt_values - + values = self.values + if isinstance(values, (ABCIndexClass, ABCSeries)): + values = values._values -class CategoricalArrayFormatter(GenericArrayFormatter): + formatter = values._formatter(boxed=True) - def __init__(self, values, *args, **kwargs): - GenericArrayFormatter.__init__(self, values, *args, **kwargs) + if is_categorical_dtype(values.dtype): + # Categorical is special for now, so that we can preserve tzinfo + array = values.get_values() + else: + array = np.asarray(values) - def _format_strings(self): - fmt_values = format_array(self.values.get_values(), self.formatter, + fmt_values = format_array(array, + formatter, float_format=self.float_format, na_rep=self.na_rep, digits=self.digits, space=self.space, justify=self.justify) @@ -1257,7 +1223,10 @@ def _format_datetime64(x, tz=None, nat_rep='NaT'): return nat_rep if tz is not None or not isinstance(x, Timestamp): - x = Timestamp(x, tz=tz) + if getattr(x, 'tzinfo', None) is not None: + x = Timestamp(x).tz_convert(tz) + else: + x = Timestamp(x).tz_localize(tz) return str(x) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index a6b03c9c6dd23..6425e655959bd 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -4,30 +4,28 @@ """ from __future__ import print_function -from distutils.version import LooseVersion from textwrap import dedent -from pandas import compat -from pandas.compat import (lzip, range, map, zip, u, - OrderedDict, unichr) +from pandas.compat import OrderedDict, lzip, map, range, u, unichr, zip -import pandas.core.common as com from pandas.core.dtypes.generic import ABCMultiIndex + +from pandas import compat +import pandas.core.common as com from pandas.core.config import get_option +from pandas.io.formats.format import ( + TableFormatter, buffer_put_lines, get_level_lengths) from pandas.io.formats.printing import pprint_thing -from pandas.io.formats.format import (get_level_lengths, - buffer_put_lines) -from pandas.io.formats.format import TableFormatter class HTMLFormatter(TableFormatter): indent_delta = 2 - def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, - notebook=False, border=None, table_id=None): + def __init__(self, formatter, classes=None, notebook=False, border=None, + table_id=None): self.fmt = formatter self.classes = classes @@ -36,18 +34,21 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, self.elements = [] self.bold_rows = self.fmt.kwds.get('bold_rows', False) self.escape = self.fmt.kwds.get('escape', True) - - self.max_rows = max_rows or len(self.fmt.frame) - self.max_cols = max_cols or len(self.fmt.columns) self.show_dimensions = self.fmt.show_dimensions - self.is_truncated = (self.max_rows < len(self.fmt.frame) or - self.max_cols < len(self.fmt.columns)) self.notebook = notebook if 
border is None: border = get_option('display.html.border') self.border = border self.table_id = table_id + @property + def is_truncated(self): + return self.fmt.is_truncated + + @property + def ncols(self): + return len(self.fmt.tr_frame.columns) + def write(self, s, indent=0): rs = pprint_thing(s) self.elements.append(' ' * indent + rs) @@ -79,7 +80,7 @@ def _write_cell(self, s, kind='td', indent=0, tags=None): self.write(u'{start}{rs}</{kind}>' .format(start=start_tag, rs=rs, kind=kind), indent) - def write_tr(self, line, indent=0, indent_delta=4, header=False, + def write_tr(self, line, indent=0, indent_delta=0, header=False, align=None, tags=None, nindex_levels=0): if tags is None: tags = {} @@ -159,15 +160,7 @@ def write_result(self, buf): _classes.extend(self.classes) if self.notebook: - div_style = '' - try: - import IPython - if IPython.__version__ < LooseVersion('3.0.0'): - div_style = ' style="max-width:1500px;overflow:auto;"' - except (ImportError, AttributeError): - pass - - self.write('<div{style}>'.format(style=div_style)) + self.write('<div>') self.write_style() @@ -201,26 +194,6 @@ def _write_header(self, indent): # write nothing return indent - def _column_header(): - if self.fmt.index: - row = [''] * (self.frame.index.nlevels - 1) - else: - row = [] - - if isinstance(self.columns, ABCMultiIndex): - if self.fmt.has_column_names and self.fmt.index: - row.append(single_column_table(self.columns.names)) - else: - row.append('') - style = "text-align: {just};".format(just=self.fmt.justify) - row.extend([single_column_table(c, self.fmt.justify, style) - for c in self.columns]) - else: - if self.fmt.index: - row.append(self.columns.name or '') - row.extend(self.columns) - return row - self.write('<thead>', indent) indent += self.indent_delta @@ -302,25 +275,28 @@ def _column_header(): self.write_tr(row, indent, self.indent_delta, tags=tags, header=True) else: - col_row = _column_header() + if self.fmt.index: + row = [''] * (self.frame.index.nlevels - 1) + row.append(self.columns.name or '') + else: + row = [] + row.extend(self.columns) align = self.fmt.justify if truncate_h: + if not self.fmt.index: + row_levels = 0 ins_col = row_levels + self.fmt.tr_col_num - col_row.insert(ins_col, '...') + row.insert(ins_col, '...') - self.write_tr(col_row, indent, self.indent_delta, header=True, + self.write_tr(row, indent, self.indent_delta, header=True, align=align) if all((self.fmt.has_index_names, self.fmt.index, self.fmt.show_index_names)): - row = ([x if x is not None else '' - for x in self.frame.index.names] + - [''] * min(len(self.columns), self.max_cols)) - if truncate_h: - ins_col = row_levels + self.fmt.tr_col_num - row.insert(ins_col, '') + row = ([x if x is not None else '' for x in self.frame.index.names] + + [''] * (self.ncols + (1 if truncate_h else 0))) self.write_tr(row, indent, self.indent_delta, header=True) indent -= self.indent_delta @@ -332,20 +308,13 @@ def _write_body(self, indent): self.write('<tbody>', indent) indent += self.indent_delta - fmt_values = {} - for i in range(min(len(self.columns), self.max_cols)): - fmt_values[i] = self.fmt._format_col(i) + fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} # write values - if self.fmt.index: - if isinstance(self.frame.index, ABCMultiIndex): - self._write_hierarchical_rows(fmt_values, indent) - else: - self._write_regular_rows(fmt_values, indent) + if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): + self._write_hierarchical_rows(fmt_values, indent) else: - for i in range(min(len(self.frame), self.max_rows)): - row = [fmt_values[j][i] for j in range(len(self.columns))] - self.write_tr(row, indent, self.indent_delta, tags=None) + self._write_regular_rows(fmt_values, indent) indent -= self.indent_delta self.write('</tbody>', indent) @@ -357,13 +326,17 @@ def _write_regular_rows(self, fmt_values, indent): truncate_h = self.fmt.truncate_h truncate_v = self.fmt.truncate_v - ncols = len(self.fmt.tr_frame.columns) nrows = len(self.fmt.tr_frame) - fmt = self.fmt._get_formatter('__index__') - if fmt is not None: - index_values = self.fmt.tr_frame.index.map(fmt) + + if self.fmt.index: + fmt = self.fmt._get_formatter('__index__') + if fmt is not None: + index_values = self.fmt.tr_frame.index.map(fmt) + else: + index_values = self.fmt.tr_frame.index.format() + row_levels = 1 else: - index_values = self.fmt.tr_frame.index.format() + row_levels = 0 row = [] for i in range(nrows): @@ -371,17 +344,18 @@ def _write_regular_rows(self, fmt_values, indent): if truncate_v and i == (self.fmt.tr_row_num): str_sep_row = ['...'] * len(row) self.write_tr(str_sep_row, indent, self.indent_delta, - tags=None, nindex_levels=1) + tags=None, nindex_levels=row_levels) row = [] - row.append(index_values[i]) - row.extend(fmt_values[j][i] for j in range(ncols)) + if self.fmt.index: + row.append(index_values[i]) + row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: - dot_col_ix = self.fmt.tr_col_num + 1 + dot_col_ix = self.fmt.tr_col_num + row_levels row.insert(dot_col_ix, '...') self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=1) + nindex_levels=row_levels) def _write_hierarchical_rows(self, fmt_values, indent): template = 'rowspan="{span}" valign="top"' @@ -389,7 +363,6 @@ def
_write_hierarchical_rows(self, fmt_values, indent): truncate_h = self.fmt.truncate_h truncate_v = self.fmt.truncate_v frame = self.fmt.tr_frame - ncols = len(frame.columns) nrows = len(frame) row_levels = self.frame.index.nlevels @@ -467,7 +440,7 @@ def _write_hierarchical_rows(self, fmt_values, indent): j += 1 row.append(v) - row.extend(fmt_values[j][i] for j in range(ncols)) + row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: row.insert(row_levels - sparse_offset + self.fmt.tr_col_num, '...') @@ -479,29 +452,8 @@ def _write_hierarchical_rows(self, fmt_values, indent): sparsify=False, adjoin=False, names=False))) row = [] row.extend(idx_values[i]) - row.extend(fmt_values[j][i] for j in range(ncols)) + row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: row.insert(row_levels + self.fmt.tr_col_num, '...') self.write_tr(row, indent, self.indent_delta, tags=None, nindex_levels=frame.index.nlevels) - - -def single_column_table(column, align=None, style=None): - table = '<table' - if align is not None: - table += (' align="{align}"'.format(align=align)) - if style is not None: - table += (' style="{style}"'.format(style=style)) - table += '><tbody>' - for i in column: - table += ('<tr><td>{i!s}</td></tr>'.format(i=i)) - table += '</tbody></table>' - return table - - -def single_row_table(row): # pragma: no cover - table = '<table><tbody><tr>' - for i in row: - table += ('<td>{i!s}</td>'.format(i=i)) - table += '</tr></tbody></table>
' - return table diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index fbbad763dd97b..90be3364932a2 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -6,11 +6,12 @@ import numpy as np -from pandas import compat -from pandas.compat import range, map, zip, u +from pandas.compat import map, range, u, zip from pandas.core.dtypes.generic import ABCMultiIndex +from pandas import compat + from pandas.io.formats.format import TableFormatter diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index e22d7bce42841..6d45d1e5dfcee 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -3,9 +3,12 @@ """ import sys + +from pandas.compat import u + from pandas.core.dtypes.inference import is_sequence + from pandas import compat -from pandas.compat import u from pandas.core.config import get_option @@ -107,10 +110,10 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): nitems = max_seq_items or get_option("max_seq_items") or len(seq) s = iter(seq) - r = [] - for i in range(min(nitems, len(seq))): # handle sets, no slicing - r.append(pprint_thing( - next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)) + # handle sets, no slicing + r = [pprint_thing(next(s), + _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) + for i in range(min(nitems, len(seq)))] body = ", ".join(r) if nitems < len(seq): @@ -268,7 +271,8 @@ class TableSchemaFormatter(BaseFormatter): max_seq_items=max_seq_items) -def format_object_summary(obj, formatter, is_justify=True, name=None): +def format_object_summary(obj, formatter, is_justify=True, name=None, + indent_for_name=True): """ Return the formatted obj as a unicode string @@ -280,8 +284,11 @@ def format_object_summary(obj, formatter, is_justify=True, name=None): string formatter for an element is_justify : boolean should justify the display - name : name, optiona + name : name, optional defaults to the class name of the obj + indent_for_name : bool, default True + Whether subsequent lines should be be indented to + align with the name. Returns ------- @@ -297,8 +304,13 @@ def format_object_summary(obj, formatter, is_justify=True, name=None): if name is None: name = obj.__class__.__name__ - space1 = "\n%s" % (' ' * (len(name) + 1)) - space2 = "\n%s" % (' ' * (len(name) + 2)) + if indent_for_name: + name_len = len(name) + space1 = "\n%s" % (' ' * (name_len + 1)) + space2 = "\n%s" % (' ' * (name_len + 2)) + else: + space1 = "\n" + space2 = "\n " # space for the opening '[' n = len(obj) sep = ',' @@ -325,15 +337,17 @@ def best_len(values): else: return 0 + close = u', ' + if n == 0: - summary = '[], ' + summary = u'[]{}'.format(close) elif n == 1: first = formatter(obj[0]) - summary = '[%s], ' % first + summary = u'[{}]{}'.format(first, close) elif n == 2: first = formatter(obj[0]) last = formatter(obj[-1]) - summary = '[%s, %s], ' % (first, last) + summary = u'[{}, {}]{}'.format(first, last, close) else: if n > max_seq_items: @@ -378,7 +392,11 @@ def best_len(values): summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2) summary += line - summary += '],' + + # right now close is either '' or ', ' + # Now we want to include the ']', but not the maybe space. 
+ close = ']' + close.rstrip(' ') + summary += close if len(summary) > (display_width): summary += space1 diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index f4bb53ba4f218..4fdcb978b4695 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -2,35 +2,37 @@ Module for applying conditional formatting to DataFrames and Series. """ +from collections import MutableMapping, defaultdict +from contextlib import contextmanager +import copy from functools import partial from itertools import product -from contextlib import contextmanager from uuid import uuid1 -import copy -from collections import defaultdict, MutableMapping -try: - from jinja2 import ( - PackageLoader, Environment, ChoiceLoader, FileSystemLoader - ) -except ImportError: - msg = "pandas.Styler requires jinja2. "\ - "Please install with `conda install Jinja2`\n"\ - "or `pip install Jinja2`" - raise ImportError(msg) +import numpy as np + +from pandas.compat import range +from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_float, is_string_like +from pandas.core.dtypes.generic import ABCSeries -import numpy as np import pandas as pd from pandas.api.types import is_list_like -from pandas.compat import range +import pandas.core.common as com from pandas.core.config import get_option from pandas.core.generic import _shared_docs -import pandas.core.common as com from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice -from pandas.util._decorators import Appender -from pandas.core.dtypes.generic import ABCSeries + +try: + from jinja2 import ( + PackageLoader, Environment, ChoiceLoader, FileSystemLoader + ) +except ImportError: + raise ImportError("pandas.Styler requires jinja2. " + "Please install with `conda install Jinja2`\n" + "or `pip install Jinja2`") + try: import matplotlib.pyplot as plt @@ -51,20 +53,24 @@ def _mpl(func): class Styler(object): """ - Helps style a DataFrame or Series according to the - data with HTML and CSS. + Helps style a DataFrame or Series according to the data with HTML and CSS. Parameters ---------- - data: Series or DataFrame - precision: int + data : Series or DataFrame + precision : int precision to round floats to, defaults to pd.options.display.precision - table_styles: list-like, default None + table_styles : list-like, default None list of {selector: (attr, value)} dicts; see Notes - uuid: str, default None + uuid : str, default None a unique identifier to avoid CSS collisions; generated automatically - caption: str, default None + caption : str, default None caption to attach to the table + cell_ids : bool, default True + If True, each cell will have an ``id`` attribute in their HTML tag. + The ``id`` takes the form ``T_<uuid>_row<num_row>_col<num_col>`` + where ``<uuid>`` is the unique identifier, ``<num_row>`` is the row + number and ``<num_col>`` is the column number.
Attributes ---------- @@ -113,7 +119,7 @@ class Styler(object): template = env.get_template("html.tpl") def __init__(self, data, precision=None, table_styles=None, uuid=None, - caption=None, table_attributes=None): + caption=None, table_attributes=None, cell_ids=True): self.ctx = defaultdict(list) self._todo = [] @@ -137,6 +143,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, self.table_attributes = table_attributes self.hidden_index = False self.hidden_columns = [] + self.cell_ids = cell_ids # display_funcs maps (row, col) -> formatting function @@ -149,7 +156,9 @@ def default_display_func(x): self._display_funcs = defaultdict(lambda: default_display_func) def _repr_html_(self): - """Hooks into Jupyter notebook rich display system.""" + """ + Hooks into Jupyter notebook rich display system. + """ return self.render() @Appender(_shared_docs['to_excel'] % dict( @@ -179,7 +188,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', def _translate(self): """ Convert the DataFrame in `self.data` and the attrs from `_build_styles` - into a dictionary of {head, body, uuid, cellstyle} + into a dictionary of {head, body, uuid, cellstyle}. """ table_styles = self.table_styles or [] caption = self.caption @@ -307,14 +316,16 @@ def format_attr(pair): cs.extend(cell_context.get("data", {}).get(r, {}).get(c, [])) formatter = self._display_funcs[(r, c)] value = self.data.iloc[r, c] - row_es.append({ - "type": "td", - "value": value, - "class": " ".join(cs), - "id": "_".join(cs[1:]), - "display_value": formatter(value), - "is_visible": (c not in hidden_columns) - }) + row_dict = {"type": "td", + "value": value, + "class": " ".join(cs), + "display_value": formatter(value), + "is_visible": (c not in hidden_columns)} + # only add an id if the cell has a style + if (self.cell_ids or + not(len(ctx[r, c]) == 1 and ctx[r, c][0] == '')): + row_dict["id"] = "_".join(cs[1:]) + row_es.append(row_dict) props = [] for x in ctx[r, c]: # have to handle empty styles like [''] @@ -349,8 +360,8 @@ def format(self, formatter, subset=None): Parameters ---------- - formatter: str, callable, or dict - subset: IndexSlice + formatter : str, callable, or dict + subset : IndexSlice An argument to ``DataFrame.loc`` that restricts which elements ``formatter`` is applied to. @@ -407,21 +418,20 @@ def format(self, formatter, subset=None): return self def render(self, **kwargs): - """Render the built up styles to HTML + """ + Render the built up styles to HTML. Parameters ---------- - `**kwargs`: - Any additional keyword arguments are passed through - to ``self.template.render``. This is useful when you - need to provide additional variables for a custom - template. + `**kwargs` : Any additional keyword arguments are passed through + to ``self.template.render``. This is useful when you need to provide + additional variables for a custom template. .. versionadded:: 0.20 Returns ------- - rendered: str + rendered : str the rendered HTML Notes @@ -459,10 +469,11 @@ def render(self, **kwargs): def _update_ctx(self, attrs): """ - update the state of the Styler. Collects a mapping - of {index_label: ['<property>: <value>']} + Update the state of the Styler. + + Collects a mapping of {index_label: ['<property>: <value>']}. - attrs: Series or DataFrame + attrs : Series or DataFrame should contain strings of '<property>: <value>;<prop2>: <val2>' Whitespace shouldn't matter and the final trailing ';' shouldn't matter.
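A minimal sketch of how the new ``cell_ids`` switch wired up above behaves; the DataFrame is made up for illustration and is not part of the patch:

    import pandas as pd
    from pandas.io.formats.style import Styler

    df = pd.DataFrame({'a': [1, -2], 'b': [-3, 4]})

    # Default behaviour: every cell gets an id of the form "T_<uuid>_row0_col0".
    html_full = Styler(df).render()

    # With cell_ids=False, an id is emitted only for cells that actually carry
    # a style, which keeps the generated HTML much smaller for large frames.
    styled = Styler(df, cell_ids=False).applymap(
        lambda v: 'color: red' if v < 0 else '')
    html_sparse = styled.render()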
@@ -496,7 +507,8 @@ def __deepcopy__(self, memo): return self._copy(deepcopy=True) def clear(self): - """"Reset" the styler, removing any previously applied styles. + """ + Reset the styler, removing any previously applied styles. Returns None. """ self.ctx.clear() @@ -629,7 +641,6 @@ def applymap(self, func, subset=None, **kwargs): See Also -------- Styler.where - """ self._todo.append((lambda instance: getattr(instance, '_applymap'), (func, subset), kwargs)) @@ -664,7 +675,6 @@ def where(self, cond, value, other=None, subset=None, **kwargs): See Also -------- Styler.applymap - """ if other is None: @@ -679,7 +689,7 @@ def set_precision(self, precision): Parameters ---------- - precision: int + precision : int Returns ------- @@ -690,9 +700,10 @@ def set_precision(self, precision): def set_table_attributes(self, attributes): """ - Set the table attributes. These are the items - that show up in the opening ``<table>`` tag in addition - to to automatic (by default) id. + Set the table attributes. + + These are the items that show up in the opening ``<table>`` tag
+ in addition to to automatic (by default) id. Parameters ---------- @@ -714,11 +725,12 @@ def set_table_attributes(self, attributes): def export(self): """ Export the styles to applied to the current Styler. + Can be applied to a second style with ``Styler.use``. Returns ------- - styles: list + styles : list See Also -------- @@ -733,7 +745,7 @@ def use(self, styles): Parameters ---------- - styles: list + styles : list list of style functions Returns @@ -753,7 +765,7 @@ def set_uuid(self, uuid): Parameters ---------- - uuid: str + uuid : str Returns ------- @@ -768,7 +780,7 @@ def set_caption(self, caption): Parameters ---------- - caption: str + caption : str Returns ------- @@ -779,12 +791,13 @@ def set_caption(self, caption): def set_table_styles(self, table_styles): """ Set the table styles on a Styler. These are placed in a - ``<style>