From 1a056df275ea602976c1dd3f8ccbe8295c9bd042 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 31 Dec 2022 17:56:19 +0000 Subject: [PATCH 01/13] Revert "STYLE use pandas-dev-flaker (#40906)" This reverts commit 1f27ed0adc2fea2dd301d2dccab5215afaca0e6b. --- .pre-commit-config.yaml | 96 +++- asv_bench/benchmarks/pandas_vb_common.py | 2 +- ci/code_checks.sh | 71 ++- doc/source/development/code_style.rst | 218 ++++++++ environment.yml | 1 - pandas/_testing/__init__.py | 2 +- pandas/tests/api/test_api.py | 2 +- pandas/tests/io/pytables/test_store.py | 2 +- pandas/tests/tools/test_to_numeric.py | 4 +- requirements-dev.txt | 1 - ...check_for_inconsistent_pandas_namespace.py | 142 +++++ .../test_inconsistent_namespace_check.py | 61 +++ scripts/tests/test_use_pd_array_in_core.py | 2 +- .../tests/test_validate_unwanted_patterns.py | 419 +++++++++++++++ scripts/validate_unwanted_patterns.py | 487 ++++++++++++++++++ 15 files changed, 1497 insertions(+), 13 deletions(-) create mode 100644 doc/source/development/code_style.rst create mode 100644 scripts/check_for_inconsistent_pandas_namespace.py create mode 100644 scripts/tests/test_inconsistent_namespace_check.py create mode 100644 scripts/tests/test_validate_unwanted_patterns.py create mode 100755 scripts/validate_unwanted_patterns.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f3158e64df8dd..f33353d8f6a38 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -70,12 +70,9 @@ repos: rev: 6.0.0 hooks: - id: flake8 - # Need to patch os.remove rule in pandas-dev-flaker - exclude: ^ci/fix_wheels.py additional_dependencies: &flake8_dependencies - flake8==6.0.0 - flake8-bugbear==22.7.1 - - pandas-dev-flaker==0.5.0 - repo: https://github.com/pycqa/pylint rev: v2.15.6 hooks: @@ -184,6 +181,28 @@ repos: types: [rst] args: [--filename=*.rst] additional_dependencies: [flake8-rst==0.7.0, flake8==3.7.9] + - id: frame-or-series-union + name: Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias + entry: Union\[.*(Series,.*DataFrame|DataFrame,.*Series).*\] + language: pygrep + types: [python] + exclude: ^pandas/_typing\.py$ + - id: inconsistent-namespace-usage + name: 'Check for inconsistent use of pandas namespace' + entry: python scripts/check_for_inconsistent_pandas_namespace.py + language: python + types: [python] + - id: no-os-remove + name: Check code for instances of os.remove + entry: os\.remove + language: pygrep + types: [python] + files: ^pandas/tests/ + exclude: | + (?x)^ + pandas/tests/io/excel/test_writers\.py + |pandas/tests/io/pytables/common\.py + |pandas/tests/io/pytables/test_store\.py$ - id: unwanted-patterns name: Unwanted patterns language: pygrep @@ -193,6 +212,24 @@ repos: \#\ type:\ (?!ignore) |\#\ type:\s?ignore(?!\[) + # foo._class__ instead of type(foo) + |\.__class__ + + # np.bool/np.object instead of np.bool_/np.object_ + |np\.bool[^_8] + |np\.object[^_8] + + # imports from pandas.core.common instead of `import pandas.core.common as com` + |from\ pandas\.core\.common\ import + |from\ pandas\.core\ import\ common + + # imports from collections.abc instead of `from collections import abc` + |from\ collections\.abc\ import + + # Numpy + |from\ numpy\ import\ random + |from\ numpy\.random\ import + # Incorrect code-block / IPython directives |\.\.\ code-block\ :: |\.\.\ ipython\ :: @@ -202,6 +239,7 @@ repos: # Check for deprecated messages without sphinx directive |(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.) types_or: [python, cython, rst] + exclude: ^doc/source/development/code_style\.rst # contains examples of patterns to avoid - id: cython-casting name: Check Cython casting is `obj`, not ` obj` language: pygrep @@ -232,6 +270,29 @@ repos: files: ^pandas/tests/extension/base types: [python] exclude: ^pandas/tests/extension/base/base\.py + - id: unwanted-patterns-in-tests + name: Unwanted patterns in tests + language: pygrep + entry: | + (?x) + # pytest.xfail instead of pytest.mark.xfail + pytest\.xfail + + # imports from pandas._testing instead of `import pandas._testing as tm` + |from\ pandas\._testing\ import + |from\ pandas\ import\ _testing\ as\ tm + + # No direct imports from conftest + |conftest\ import + |import\ conftest + + # pandas.testing instead of tm + |pd\.testing\. + + # pd.api.types instead of from pandas.api.types import ... + |(pd|pandas)\.api\.types\. + files: ^pandas/tests/ + types_or: [python, cython, rst] - id: pip-to-conda name: Generate pip dependency from conda language: python @@ -252,6 +313,35 @@ repos: language: python types: [rst] files: ^doc/source/(development|reference)/ + - id: unwanted-patterns-bare-pytest-raises + name: Check for use of bare pytest raises + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises" + types: [python] + files: ^pandas/tests/ + exclude: ^pandas/tests/extension/ + - id: unwanted-patterns-private-function-across-module + name: Check for use of private functions across modules + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" + types: [python] + exclude: ^(asv_bench|pandas/tests|doc)/ + - id: unwanted-patterns-private-import-across-module + name: Check for import of private attributes across modules + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" + types: [python] + exclude: ^(asv_bench|pandas/tests|doc)/ + - id: unwanted-patterns-strings-to-concatenate + name: Check for use of not concatenated strings + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" + types_or: [python, cython] + - id: unwanted-patterns-strings-with-wrong-placed-whitespace + name: Check for strings with wrong placed spaces + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" + types_or: [python, cython] - id: use-pd_array-in-core name: Import pandas.array as pd_array in core language: python diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index d3168bde0a783..97d91111e833a 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -70,7 +70,7 @@ class BaseIO: def remove(self, f): """Remove created files""" try: - os.remove(f) # noqa: PDF008 + os.remove(f) except OSError: # On Windows, attempting to remove a file that is in use # causes an exception to be raised diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3c1362b1ac83e..228cc5518495e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -8,14 +8,16 @@ # # Usage: # $ ./ci/code_checks.sh # run all checks +# $ ./ci/code_checks.sh lint # run linting only +# $ ./ci/code_checks.sh patterns # check for patterns that should not exist # $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors # $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free # $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks -[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ - { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } +[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ + { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } BASE_DIR="$(dirname $0)/.." RET=0 @@ -38,6 +40,71 @@ if [[ "$GITHUB_ACTIONS" == "true" ]]; then INVGREP_PREPEND="##[error]" fi +### LINTING ### +if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then + + # Check that cython casting is of the form `obj` as opposed to ` obj`; + # it doesn't make a difference, but we want to be internally consistent. + # Note: this grep pattern is (intended to be) equivalent to the python + # regex r'(?])> ' + MSG='Linting .pyx code for spacing conventions in casting' ; echo $MSG + invgrep -r -E --include '*.pyx' --include '*.pxi.in' '[a-zA-Z0-9*]> ' pandas/_libs + RET=$(($RET + $?)) ; echo $MSG "DONE" + + # readability/casting: Warnings about C casting instead of C++ casting + # runtime/int: Warnings about using C number types instead of C++ ones + # build/include_subdir: Warnings about prefacing included header files with directory + +fi + +### PATTERNS ### +if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then + + MSG='Check for use of exec' ; echo $MSG + invgrep -R --include="*.py*" -E "[^a-zA-Z0-9_]exec\(" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for pytest warns' ; echo $MSG + invgrep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for pytest raises without context' ; echo $MSG + invgrep -r -E --include '*.py' "[[:space:]] pytest.raises" pandas/tests/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for use of builtin filter function' ; echo $MSG + invgrep -R --include="*.py" -P '(?`_ +standard and uses `Black `_ +and `Flake8 `_ to ensure a +consistent code format throughout the project. We encourage you to use +:ref:`pre-commit ` to automatically run ``black``, +``flake8``, ``isort``, and related code checks when you make a git commit. + +Patterns +======== + +Using foo.__class__ +------------------- + + +pandas uses 'type(foo)' instead 'foo.__class__' as it is making the code more +readable. +For example: + +**Good:** + +.. code-block:: python + + foo = "bar" + type(foo) + +**Bad:** + +.. code-block:: python + + foo = "bar" + foo.__class__ + + +String formatting +================= + +Concatenated strings +-------------------- + +Using f-strings +~~~~~~~~~~~~~~~ + +pandas uses f-strings formatting instead of '%' and '.format()' string formatters. + +The convention of using f-strings on a string that is concatenated over several lines, +is to prefix only the lines containing values which need to be interpreted. + +For example: + +**Good:** + +.. code-block:: python + + foo = "old_function" + bar = "new_function" + + my_warning_message = ( + f"Warning, {foo} is deprecated, " + "please use the new and way better " + f"{bar}" + ) + +**Bad:** + +.. code-block:: python + + foo = "old_function" + bar = "new_function" + + my_warning_message = ( + f"Warning, {foo} is deprecated, " + f"please use the new and way better " + f"{bar}" + ) + +White spaces +~~~~~~~~~~~~ + +Only put white space at the end of the previous line, so +there is no whitespace at the beginning of the concatenated string. + +For example: + +**Good:** + +.. code-block:: python + + example_string = ( + "Some long concatenated string, " + "with good placement of the " + "whitespaces" + ) + +**Bad:** + +.. code-block:: python + + example_string = ( + "Some long concatenated string," + " with bad placement of the" + " whitespaces" + ) + +Representation function (aka 'repr()') +-------------------------------------- + +pandas uses 'repr()' instead of '%r' and '!r'. + +The use of 'repr()' will only happen when the value is not an obvious string. + +For example: + +**Good:** + +.. code-block:: python + + value = str + f"Unknown received value, got: {repr(value)}" + +**Good:** + +.. code-block:: python + + value = str + f"Unknown received type, got: '{type(value).__name__}'" + + +Imports (aim for absolute) +========================== + +In Python 3, absolute imports are recommended. Using absolute imports, doing something +like ``import string`` will import the string module rather than ``string.py`` +in the same directory. As much as possible, you should try to write out +absolute imports that show the whole import chain from top-level pandas. + +Explicit relative imports are also supported in Python 3 but it is not +recommended to use them. Implicit relative imports should never be used +and are removed in Python 3. + +For example: + +:: + + # preferred + import pandas.core.common as com + + # not preferred + from .common import test_base + + # wrong + from common import test_base + +Testing +======= + +Failing tests +-------------- + +See https://docs.pytest.org/en/latest/skipping.html for background. + +Do not use ``pytest.xfail`` +--------------------------- + +Do not use this method. It has the same behavior as ``pytest.skip``, namely +it immediately stops the test and does not check if the test will fail. If +this is the behavior you desire, use ``pytest.skip`` instead. + +Using ``pytest.mark.xfail`` +--------------------------- + +Use this method if a test is known to fail but the manner in which it fails +is not meant to be captured. It is common to use this method for a test that +exhibits buggy behavior or a non-implemented feature. If +the failing test has flaky behavior, use the argument ``strict=False``. This +will make it so pytest does not fail if the test happens to pass. + +Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param`` +over usage within a test so that the test is appropriately marked during the +collection phase of pytest. For xfailing a test that involves multiple +parameters, a fixture, or a combination of these, it is only possible to +xfail during the testing phase. To do so, use the ``request`` fixture: + +.. code-block:: python + + import pytest + + def test_xfail(request): + request.node.add_marker(pytest.mark.xfail(reason="Indicate why here")) + +xfail is not to be used for tests involving failure due to invalid user arguments. +For these tests, we need to verify the correct exception type and error message +is being raised, using ``pytest.raises`` instead. + +Miscellaneous +============= + +Reading from a url +------------------ + +**Good:** + +.. code-block:: python + + from pandas.io.common import urlopen + + with urlopen("http://www.google.com") as url: + raw_text = url.read() diff --git a/environment.yml b/environment.yml index b6b8f7d6af1ba..9eaf52977d6d2 100644 --- a/environment.yml +++ b/environment.yml @@ -90,7 +90,6 @@ dependencies: - gitdb - natsort # DataFrame.sort_values doctest - numpydoc - - pandas-dev-flaker=0.5.0 - pydata-sphinx-theme<0.11 - pytest-cython # doctest - sphinx diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 43020ae471f10..eb2905751a9b4 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -886,7 +886,7 @@ def external_error_raised(expected_exception: type[Exception]) -> ContextManager """ import pytest - return pytest.raises(expected_exception, match=None) # noqa: PDF010 + return pytest.raises(expected_exception, match=None) cython_table = pd.core.common._cython_table.items() diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 995b1668046d2..e448e1bce9146 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -250,7 +250,7 @@ class TestTesting(Base): ] def test_testing(self): - from pandas import testing # noqa: PDF015 + from pandas import testing self.check(testing, self.funcs) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 1263d61b55cd5..2664c7df59223 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -911,7 +911,7 @@ def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): os.close(fd) except (OSError, ValueError): pass - os.remove(new_f) # noqa: PDF008 + os.remove(new_f) # new table df = tm.makeDataFrame() diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 1347f6eb50b09..a95cc41f6397d 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -234,7 +234,9 @@ def test_type_check(errors): # see gh-11776 df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]}) kwargs = {"errors": errors} if errors is not None else {} - with pytest.raises(TypeError, match="1-d array"): + error_ctx = pytest.raises(TypeError, match="1-d array") + + with error_ctx: to_numeric(df, **kwargs) diff --git a/requirements-dev.txt b/requirements-dev.txt index 4f2a80d932fd0..428c2cb92d040 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -65,7 +65,6 @@ gitpython gitdb natsort numpydoc -pandas-dev-flaker==0.5.0 pydata-sphinx-theme<0.11 pytest-cython sphinx diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py new file mode 100644 index 0000000000000..3c21821e794a9 --- /dev/null +++ b/scripts/check_for_inconsistent_pandas_namespace.py @@ -0,0 +1,142 @@ +""" +Check that test suite file doesn't use the pandas namespace inconsistently. + +We check for cases of ``Series`` and ``pd.Series`` appearing in the same file +(likewise for other pandas objects). + +This is meant to be run as a pre-commit hook - to run it manually, you can do: + + pre-commit run inconsistent-namespace-usage --all-files + +To automatically fixup a given file, you can pass `--replace`, e.g. + + python scripts/check_for_inconsistent_pandas_namespace.py test_me.py --replace + +though note that you may need to manually fixup some imports and that you will also +need the additional dependency `tokenize-rt` (which is left out from the pre-commit +hook so that it uses the same virtualenv as the other local ones). + +The general structure is similar to that of some plugins from +https://github.com/asottile/pyupgrade . +""" + +import argparse +import ast +import sys +from typing import ( + MutableMapping, + NamedTuple, + Optional, + Sequence, + Set, +) + +ERROR_MESSAGE = ( + "{path}:{lineno}:{col_offset}: " + "Found both '{prefix}.{name}' and '{name}' in {path}" +) + + +class OffsetWithNamespace(NamedTuple): + lineno: int + col_offset: int + namespace: str + + +class Visitor(ast.NodeVisitor): + def __init__(self) -> None: + self.pandas_namespace: MutableMapping[OffsetWithNamespace, str] = {} + self.imported_from_pandas: Set[str] = set() + + def visit_Attribute(self, node: ast.Attribute) -> None: + if isinstance(node.value, ast.Name) and node.value.id in {"pandas", "pd"}: + offset_with_namespace = OffsetWithNamespace( + node.lineno, node.col_offset, node.value.id + ) + self.pandas_namespace[offset_with_namespace] = node.attr + self.generic_visit(node) + + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: + if node.module is not None and "pandas" in node.module: + self.imported_from_pandas.update(name.name for name in node.names) + self.generic_visit(node) + + +def replace_inconsistent_pandas_namespace(visitor: Visitor, content: str) -> str: + from tokenize_rt import ( + reversed_enumerate, + src_to_tokens, + tokens_to_src, + ) + + tokens = src_to_tokens(content) + for n, i in reversed_enumerate(tokens): + offset_with_namespace = OffsetWithNamespace(i.offset[0], i.offset[1], i.src) + if ( + offset_with_namespace in visitor.pandas_namespace + and visitor.pandas_namespace[offset_with_namespace] + in visitor.imported_from_pandas + ): + # Replace `pd` + tokens[n] = i._replace(src="") + # Replace `.` + tokens[n + 1] = tokens[n + 1]._replace(src="") + + new_src: str = tokens_to_src(tokens) + return new_src + + +def check_for_inconsistent_pandas_namespace( + content: str, path: str, *, replace: bool +) -> Optional[str]: + tree = ast.parse(content) + + visitor = Visitor() + visitor.visit(tree) + + inconsistencies = visitor.imported_from_pandas.intersection( + visitor.pandas_namespace.values() + ) + + if not inconsistencies: + # No inconsistent namespace usage, nothing to replace. + return None + + if not replace: + inconsistency = inconsistencies.pop() + lineno, col_offset, prefix = next( + key for key, val in visitor.pandas_namespace.items() if val == inconsistency + ) + msg = ERROR_MESSAGE.format( + lineno=lineno, + col_offset=col_offset, + prefix=prefix, + name=inconsistency, + path=path, + ) + sys.stdout.write(msg) + sys.exit(1) + + return replace_inconsistent_pandas_namespace(visitor, content) + + +def main(argv: Optional[Sequence[str]] = None) -> None: + parser = argparse.ArgumentParser() + parser.add_argument("paths", nargs="*") + parser.add_argument("--replace", action="store_true") + args = parser.parse_args(argv) + + for path in args.paths: + with open(path, encoding="utf-8") as fd: + content = fd.read() + new_content = check_for_inconsistent_pandas_namespace( + content, path, replace=args.replace + ) + if not args.replace or new_content is None: + continue + with open(path, "w", encoding="utf-8") as fd: + fd.write(new_content) + + +if __name__ == "__main__": + main() diff --git a/scripts/tests/test_inconsistent_namespace_check.py b/scripts/tests/test_inconsistent_namespace_check.py new file mode 100644 index 0000000000000..eb995158d8cb4 --- /dev/null +++ b/scripts/tests/test_inconsistent_namespace_check.py @@ -0,0 +1,61 @@ +import pytest + +from ..check_for_inconsistent_pandas_namespace import ( + check_for_inconsistent_pandas_namespace, +) + +BAD_FILE_0 = ( + "from pandas import Categorical\n" + "cat_0 = Categorical()\n" + "cat_1 = pd.Categorical()" +) +BAD_FILE_1 = ( + "from pandas import Categorical\n" + "cat_0 = pd.Categorical()\n" + "cat_1 = Categorical()" +) +BAD_FILE_2 = ( + "from pandas import Categorical\n" + "cat_0 = pandas.Categorical()\n" + "cat_1 = Categorical()" +) +GOOD_FILE_0 = ( + "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = Categorical()" +) +GOOD_FILE_1 = "cat_0 = pd.Categorical()\ncat_1 = pd.Categorical()" +GOOD_FILE_2 = "from array import array\nimport pandas as pd\narr = pd.array([])" +PATH = "t.py" + + +@pytest.mark.parametrize( + "content, expected", + [ + (BAD_FILE_0, "t.py:3:8: Found both 'pd.Categorical' and 'Categorical' in t.py"), + (BAD_FILE_1, "t.py:2:8: Found both 'pd.Categorical' and 'Categorical' in t.py"), + ( + BAD_FILE_2, + "t.py:2:8: Found both 'pandas.Categorical' and 'Categorical' in t.py", + ), + ], +) +def test_inconsistent_usage(content, expected, capsys): + with pytest.raises(SystemExit): + check_for_inconsistent_pandas_namespace(content, PATH, replace=False) + result, _ = capsys.readouterr() + assert result == expected + + +@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1, GOOD_FILE_2]) +@pytest.mark.parametrize("replace", [True, False]) +def test_consistent_usage(content, replace): + # should not raise + check_for_inconsistent_pandas_namespace(content, PATH, replace=replace) + + +@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1, BAD_FILE_2]) +def test_inconsistent_usage_with_replace(content): + result = check_for_inconsistent_pandas_namespace(content, PATH, replace=True) + expected = ( + "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = Categorical()" + ) + assert result == expected diff --git a/scripts/tests/test_use_pd_array_in_core.py b/scripts/tests/test_use_pd_array_in_core.py index 8f13a6e735899..9c66199a82846 100644 --- a/scripts/tests/test_use_pd_array_in_core.py +++ b/scripts/tests/test_use_pd_array_in_core.py @@ -14,7 +14,7 @@ def test_inconsistent_usage(content, capsys): result_msg = ( "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n" ) - with pytest.raises(SystemExit, match=None): + with pytest.raises(SystemExit): use_pd_array(content, PATH) expected_msg, _ = capsys.readouterr() assert result_msg == expected_msg diff --git a/scripts/tests/test_validate_unwanted_patterns.py b/scripts/tests/test_validate_unwanted_patterns.py new file mode 100644 index 0000000000000..ef93fd1d21981 --- /dev/null +++ b/scripts/tests/test_validate_unwanted_patterns.py @@ -0,0 +1,419 @@ +import io + +import pytest + +from .. import validate_unwanted_patterns + + +class TestBarePytestRaises: + @pytest.mark.parametrize( + "data", + [ + ( + """ + with pytest.raises(ValueError, match="foo"): + pass + """ + ), + ( + """ + # with pytest.raises(ValueError, match="foo"): + # pass + """ + ), + ( + """ + # with pytest.raises(ValueError): + # pass + """ + ), + ( + """ + with pytest.raises( + ValueError, + match="foo" + ): + pass + """ + ), + ], + ) + def test_pytest_raises(self, data): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) + assert result == [] + + @pytest.mark.parametrize( + "data, expected", + [ + ( + ( + """ + with pytest.raises(ValueError): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises(ValueError, match="foo"): + with pytest.raises(ValueError): + pass + pass + """ + ), + [ + ( + 2, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises(ValueError): + with pytest.raises(ValueError, match="foo"): + pass + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises( + ValueError + ): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises( + ValueError, + # match = "foo" + ): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ], + ) + def test_pytest_raises_raises(self, data, expected): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) + assert result == expected + + +@pytest.mark.parametrize( + "data, expected", + [ + ( + 'msg = ("bar " "baz")', + [ + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ) + ], + ), + ( + 'msg = ("foo " "bar " "baz")', + [ + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ), + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ), + ], + ), + ], +) +def test_strings_to_concatenate(data, expected): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.strings_to_concatenate(fd)) + assert result == expected + + +class TestStringsWithWrongPlacedWhitespace: + @pytest.mark.parametrize( + "data", + [ + ( + """ + msg = ( + "foo\n" + " bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + " bar" + "baz" + ) + """ + ), + ( + """ + msg = ( + f"foo" + " bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + f" bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + rf" bar" + ) + """ + ), + ], + ) + def test_strings_with_wrong_placed_whitespace(self, data): + fd = io.StringIO(data.strip()) + result = list( + validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) + ) + assert result == [] + + @pytest.mark.parametrize( + "data, expected", + [ + ( + ( + """ + msg = ( + "foo" + " bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + f"foo" + " bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + "foo" + f" bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + f"foo" + f" bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + "foo" + rf" bar" + " baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ], + ), + ( + ( + """ + msg = ( + "foo" + " bar" + rf" baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ], + ), + ( + ( + """ + msg = ( + "foo" + rf" bar" + rf" baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ], + ), + ], + ) + def test_strings_with_wrong_placed_whitespace_raises(self, data, expected): + fd = io.StringIO(data.strip()) + result = list( + validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) + ) + assert result == expected diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py new file mode 100755 index 0000000000000..f90fd6c3e8a3e --- /dev/null +++ b/scripts/validate_unwanted_patterns.py @@ -0,0 +1,487 @@ +#!/usr/bin/env python3 +""" +Unwanted patterns test cases. + +The reason this file exist despite the fact we already have +`ci/code_checks.sh`, +(see https://github.com/pandas-dev/pandas/blob/master/ci/code_checks.sh) + +is that some of the test cases are more complex/impossible to validate via regex. +So this file is somewhat an extensions to `ci/code_checks.sh` +""" + +import argparse +import ast +import sys +import token +import tokenize +from typing import ( + IO, + Callable, + Iterable, + List, + Set, + Tuple, +) + +PRIVATE_IMPORTS_TO_IGNORE: Set[str] = { + "_extension_array_shared_docs", + "_index_shared_docs", + "_interval_shared_docs", + "_merge_doc", + "_shared_docs", + "_apply_docs", + "_new_Index", + "_new_PeriodIndex", + "_doc_template", + "_agg_template", + "_pipe_template", + "__main__", + "_transform_template", + "_flex_comp_doc_FRAME", + "_op_descriptions", + "_IntegerDtype", + "_use_inf_as_na", + "_get_plot_backend", + "_matplotlib", + "_arrow_utils", + "_registry", + "_get_offset", # TODO: remove after get_offset deprecation enforced + "_test_parse_iso8601", + "_json_normalize", # TODO: remove after deprecation is enforced + "_testing", + "_test_decorators", + "__version__", # check np.__version__ in compat.numpy.function +} + + +def _get_literal_string_prefix_len(token_string: str) -> int: + """ + Getting the length of the literal string prefix. + + Parameters + ---------- + token_string : str + String to check. + + Returns + ------- + int + Length of the literal string prefix. + + Examples + -------- + >>> example_string = "'Hello world'" + >>> _get_literal_string_prefix_len(example_string) + 0 + >>> example_string = "r'Hello world'" + >>> _get_literal_string_prefix_len(example_string) + 1 + """ + try: + return min( + token_string.find(quote) + for quote in (r"'", r'"') + if token_string.find(quote) >= 0 + ) + except ValueError: + return 0 + + +def bare_pytest_raises(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Test Case for bare pytest raises. + + For example, this is wrong: + + >>> with pytest.raise(ValueError): + ... # Some code that raises ValueError + + And this is what we want instead: + + >>> with pytest.raise(ValueError, match="foo"): + ... # Some code that raises ValueError + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explanation of the error. + + Notes + ----- + GH #23922 + """ + contents = file_obj.read() + tree = ast.parse(contents) + + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + + try: + if not (node.func.value.id == "pytest" and node.func.attr == "raises"): + continue + except AttributeError: + continue + + if not node.keywords: + yield ( + node.lineno, + "Bare pytests raise have been found. " + "Please pass in the argument 'match' as well the exception.", + ) + else: + # Means that there are arguments that are being passed in, + # now we validate that `match` is one of the passed in arguments + if not any(keyword.arg == "match" for keyword in node.keywords): + yield ( + node.lineno, + "Bare pytests raise have been found. " + "Please pass in the argument 'match' as well the exception.", + ) + + +PRIVATE_FUNCTIONS_ALLOWED = {"sys._getframe"} # no known alternative + + +def private_function_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Checking that a private function is not used across modules. + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + Yields + ------ + line_number : int + Line number of the private function that is used across modules. + msg : str + Explanation of the error. + """ + contents = file_obj.read() + tree = ast.parse(contents) + + imported_modules: Set[str] = set() + + for node in ast.walk(tree): + if isinstance(node, (ast.Import, ast.ImportFrom)): + for module in node.names: + module_fqdn = module.name if module.asname is None else module.asname + imported_modules.add(module_fqdn) + + if not isinstance(node, ast.Call): + continue + + try: + module_name = node.func.value.id + function_name = node.func.attr + except AttributeError: + continue + + # Exception section # + + # (Debatable) Class case + if module_name[0].isupper(): + continue + # (Debatable) Dunder methods case + elif function_name.startswith("__") and function_name.endswith("__"): + continue + elif module_name + "." + function_name in PRIVATE_FUNCTIONS_ALLOWED: + continue + + if module_name in imported_modules and function_name.startswith("_"): + yield (node.lineno, f"Private function '{module_name}.{function_name}'") + + +def private_import_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Checking that a private function is not imported across modules. + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + Yields + ------ + line_number : int + Line number of import statement, that imports the private function. + msg : str + Explanation of the error. + """ + contents = file_obj.read() + tree = ast.parse(contents) + + for node in ast.walk(tree): + if not (isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom)): + continue + + for module in node.names: + module_name = module.name.split(".")[-1] + if module_name in PRIVATE_IMPORTS_TO_IGNORE: + continue + + if module_name.startswith("_"): + yield (node.lineno, f"Import of internal function {repr(module_name)}") + + +def strings_to_concatenate(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + This test case is necessary after 'Black' (https://github.com/psf/black), + is formatting strings over multiple lines. + + For example, when this: + + >>> foo = ( + ... "bar " + ... "baz" + ... ) + + Is becoming this: + + >>> foo = ("bar " "baz") + + 'Black' is not considering this as an + issue (see https://github.com/psf/black/issues/1051), + so we are checking it here instead. + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explanation of the error. + + Notes + ----- + GH #30454 + """ + tokens: List = list(tokenize.generate_tokens(file_obj.readline)) + + for current_token, next_token in zip(tokens, tokens[1:]): + if current_token.type == next_token.type == token.STRING: + yield ( + current_token.start[0], + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ) + + +def strings_with_wrong_placed_whitespace( + file_obj: IO[str], +) -> Iterable[Tuple[int, str]]: + """ + Test case for leading spaces in concated strings. + + For example: + + >>> rule = ( + ... "We want the space at the end of the line, " + ... "not at the beginning" + ... ) + + Instead of: + + >>> rule = ( + ... "We want the space at the end of the line," + ... " not at the beginning" + ... ) + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explanation of the error. + """ + + def has_wrong_whitespace(first_line: str, second_line: str) -> bool: + """ + Checking if the two lines are mattching the unwanted pattern. + + Parameters + ---------- + first_line : str + First line to check. + second_line : str + Second line to check. + + Returns + ------- + bool + True if the two received string match, an unwanted pattern. + + Notes + ----- + The unwanted pattern that we are trying to catch is if the spaces in + a string that is concatenated over multiple lines are placed at the + end of each string, unless this string is ending with a + newline character (\n). + + For example, this is bad: + + >>> rule = ( + ... "We want the space at the end of the line," + ... " not at the beginning" + ... ) + + And what we want is: + + >>> rule = ( + ... "We want the space at the end of the line, " + ... "not at the beginning" + ... ) + + And if the string is ending with a new line character (\n) we + do not want any trailing whitespaces after it. + + For example, this is bad: + + >>> rule = ( + ... "We want the space at the begging of " + ... "the line if the previous line is ending with a \n " + ... "not at the end, like always" + ... ) + + And what we do want is: + + >>> rule = ( + ... "We want the space at the begging of " + ... "the line if the previous line is ending with a \n" + ... " not at the end, like always" + ... ) + """ + if first_line.endswith(r"\n"): + return False + elif first_line.startswith(" ") or second_line.startswith(" "): + return False + elif first_line.endswith(" ") or second_line.endswith(" "): + return False + elif (not first_line.endswith(" ")) and second_line.startswith(" "): + return True + return False + + tokens: List = list(tokenize.generate_tokens(file_obj.readline)) + + for first_token, second_token, third_token in zip(tokens, tokens[1:], tokens[2:]): + # Checking if we are in a block of concated string + if ( + first_token.type == third_token.type == token.STRING + and second_token.type == token.NL + ): + # Striping the quotes, with the string literal prefix + first_string: str = first_token.string[ + _get_literal_string_prefix_len(first_token.string) + 1 : -1 + ] + second_string: str = third_token.string[ + _get_literal_string_prefix_len(third_token.string) + 1 : -1 + ] + + if has_wrong_whitespace(first_string, second_string): + yield ( + third_token.start[0], + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + + +def main( + function: Callable[[IO[str]], Iterable[Tuple[int, str]]], + source_path: str, + output_format: str, +) -> bool: + """ + Main entry point of the script. + + Parameters + ---------- + function : Callable + Function to execute for the specified validation type. + source_path : str + Source path representing path to a file/directory. + output_format : str + Output format of the error message. + file_extensions_to_check : str + Comma separated values of what file extensions to check. + excluded_file_paths : str + Comma separated values of what file paths to exclude during the check. + + Returns + ------- + bool + True if found any patterns are found related to the given function. + + Raises + ------ + ValueError + If the `source_path` is not pointing to existing file/directory. + """ + is_failed: bool = False + + for file_path in source_path: + with open(file_path, encoding="utf-8") as file_obj: + for line_number, msg in function(file_obj): + is_failed = True + print( + output_format.format( + source_path=file_path, line_number=line_number, msg=msg + ) + ) + + return is_failed + + +if __name__ == "__main__": + available_validation_types: List[str] = [ + "bare_pytest_raises", + "private_function_across_module", + "private_import_across_module", + "strings_to_concatenate", + "strings_with_wrong_placed_whitespace", + ] + + parser = argparse.ArgumentParser(description="Unwanted patterns checker.") + + parser.add_argument("paths", nargs="*", help="Source paths of files to check.") + parser.add_argument( + "--format", + "-f", + default="{source_path}:{line_number}:{msg}", + help="Output format of the error message.", + ) + parser.add_argument( + "--validation-type", + "-vt", + choices=available_validation_types, + required=True, + help="Validation test case to check.", + ) + + args = parser.parse_args() + + sys.exit( + main( + function=globals().get(args.validation_type), + source_path=args.paths, + output_format=args.format, + ) + ) From a747171f438aa35584928f802b043f56d4f5cd7c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 31 Dec 2022 19:00:45 +0000 Subject: [PATCH 02/13] fixups --- .pre-commit-config.yaml | 24 +- doc/scripts/eval_performance.py | 3 +- doc/source/development/code_style.rst | 218 ------------------ .../development/contributing_codebase.rst | 2 +- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/__init__.py | 2 +- pandas/_libs/lib.pyx | 2 +- pandas/compat/__init__.py | 11 +- pandas/core/arrays/string_arrow.py | 4 +- pandas/core/generic.py | 2 +- pandas/core/interchange/dataframe.py | 8 +- pandas/core/internals/managers.py | 4 +- pandas/core/strings/base.py | 4 +- pandas/core/strings/object_array.py | 4 +- pandas/core/window/ewm.py | 2 +- pandas/io/common.py | 6 +- pandas/io/json/_json.py | 4 +- pandas/io/xml.py | 11 +- pandas/tests/arrays/boolean/test_function.py | 2 +- pandas/tests/io/formats/style/test_style.py | 4 +- pandas/tests/io/pytables/test_round_trip.py | 2 +- pandas/tests/io/test_common.py | 4 +- pandas/tests/io/test_compression.py | 2 +- pandas/tests/io/test_pickle.py | 2 +- pandas/tests/tools/test_to_numeric.py | 4 +- scripts/sync_flake8_versions.py | 13 +- scripts/tests/test_sync_flake8_versions.py | 3 - scripts/tests/test_use_pd_array_in_core.py | 2 +- setup.cfg | 16 -- 29 files changed, 57 insertions(+), 310 deletions(-) delete mode 100644 doc/source/development/code_style.rst diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f33353d8f6a38..0d2179bc3e0e1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -181,15 +181,10 @@ repos: types: [rst] args: [--filename=*.rst] additional_dependencies: [flake8-rst==0.7.0, flake8==3.7.9] - - id: frame-or-series-union - name: Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias - entry: Union\[.*(Series,.*DataFrame|DataFrame,.*Series).*\] - language: pygrep - types: [python] - exclude: ^pandas/_typing\.py$ - id: inconsistent-namespace-usage name: 'Check for inconsistent use of pandas namespace' entry: python scripts/check_for_inconsistent_pandas_namespace.py + exclude: ^pandas/core/interchange/ language: python types: [python] - id: no-os-remove @@ -200,9 +195,7 @@ repos: files: ^pandas/tests/ exclude: | (?x)^ - pandas/tests/io/excel/test_writers\.py - |pandas/tests/io/pytables/common\.py - |pandas/tests/io/pytables/test_store\.py$ + pandas/tests/io/pytables/test_store\.py$ - id: unwanted-patterns name: Unwanted patterns language: pygrep @@ -216,12 +209,8 @@ repos: |\.__class__ # np.bool/np.object instead of np.bool_/np.object_ - |np\.bool[^_8] - |np\.object[^_8] - - # imports from pandas.core.common instead of `import pandas.core.common as com` - |from\ pandas\.core\.common\ import - |from\ pandas\.core\ import\ common + |np\.bool[^_8`] + |np\.object[^_8`] # imports from collections.abc instead of `from collections import abc` |from\ collections\.abc\ import @@ -331,7 +320,10 @@ repos: language: python entry: python scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" types: [python] - exclude: ^(asv_bench|pandas/tests|doc)/ + exclude: | + (?x) + ^(asv_bench|pandas/tests|doc)/ + |scripts/validate_min_versions_in_sync\.py$ - id: unwanted-patterns-strings-to-concatenate name: Check for use of not concatenated strings language: python diff --git a/doc/scripts/eval_performance.py b/doc/scripts/eval_performance.py index 85d9ce4ad01e9..f6087e02a9330 100644 --- a/doc/scripts/eval_performance.py +++ b/doc/scripts/eval_performance.py @@ -6,8 +6,7 @@ from pandas import DataFrame setup_common = """from pandas import DataFrame -from numpy.random import randn -df = DataFrame(randn(%d, 3), columns=list('abc')) +df = DataFrame(np.random.randn(%d, 3), columns=list('abc')) %s""" setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst deleted file mode 100644 index 19d83eb75e5bd..0000000000000 --- a/doc/source/development/code_style.rst +++ /dev/null @@ -1,218 +0,0 @@ -.. _code_style: - -{{ header }} - -======================= -pandas code style guide -======================= - -.. contents:: Table of contents: - :local: - -pandas follows the `PEP8 `_ -standard and uses `Black `_ -and `Flake8 `_ to ensure a -consistent code format throughout the project. We encourage you to use -:ref:`pre-commit ` to automatically run ``black``, -``flake8``, ``isort``, and related code checks when you make a git commit. - -Patterns -======== - -Using foo.__class__ -------------------- - - -pandas uses 'type(foo)' instead 'foo.__class__' as it is making the code more -readable. -For example: - -**Good:** - -.. code-block:: python - - foo = "bar" - type(foo) - -**Bad:** - -.. code-block:: python - - foo = "bar" - foo.__class__ - - -String formatting -================= - -Concatenated strings --------------------- - -Using f-strings -~~~~~~~~~~~~~~~ - -pandas uses f-strings formatting instead of '%' and '.format()' string formatters. - -The convention of using f-strings on a string that is concatenated over several lines, -is to prefix only the lines containing values which need to be interpreted. - -For example: - -**Good:** - -.. code-block:: python - - foo = "old_function" - bar = "new_function" - - my_warning_message = ( - f"Warning, {foo} is deprecated, " - "please use the new and way better " - f"{bar}" - ) - -**Bad:** - -.. code-block:: python - - foo = "old_function" - bar = "new_function" - - my_warning_message = ( - f"Warning, {foo} is deprecated, " - f"please use the new and way better " - f"{bar}" - ) - -White spaces -~~~~~~~~~~~~ - -Only put white space at the end of the previous line, so -there is no whitespace at the beginning of the concatenated string. - -For example: - -**Good:** - -.. code-block:: python - - example_string = ( - "Some long concatenated string, " - "with good placement of the " - "whitespaces" - ) - -**Bad:** - -.. code-block:: python - - example_string = ( - "Some long concatenated string," - " with bad placement of the" - " whitespaces" - ) - -Representation function (aka 'repr()') --------------------------------------- - -pandas uses 'repr()' instead of '%r' and '!r'. - -The use of 'repr()' will only happen when the value is not an obvious string. - -For example: - -**Good:** - -.. code-block:: python - - value = str - f"Unknown received value, got: {repr(value)}" - -**Good:** - -.. code-block:: python - - value = str - f"Unknown received type, got: '{type(value).__name__}'" - - -Imports (aim for absolute) -========================== - -In Python 3, absolute imports are recommended. Using absolute imports, doing something -like ``import string`` will import the string module rather than ``string.py`` -in the same directory. As much as possible, you should try to write out -absolute imports that show the whole import chain from top-level pandas. - -Explicit relative imports are also supported in Python 3 but it is not -recommended to use them. Implicit relative imports should never be used -and are removed in Python 3. - -For example: - -:: - - # preferred - import pandas.core.common as com - - # not preferred - from .common import test_base - - # wrong - from common import test_base - -Testing -======= - -Failing tests --------------- - -See https://docs.pytest.org/en/latest/skipping.html for background. - -Do not use ``pytest.xfail`` ---------------------------- - -Do not use this method. It has the same behavior as ``pytest.skip``, namely -it immediately stops the test and does not check if the test will fail. If -this is the behavior you desire, use ``pytest.skip`` instead. - -Using ``pytest.mark.xfail`` ---------------------------- - -Use this method if a test is known to fail but the manner in which it fails -is not meant to be captured. It is common to use this method for a test that -exhibits buggy behavior or a non-implemented feature. If -the failing test has flaky behavior, use the argument ``strict=False``. This -will make it so pytest does not fail if the test happens to pass. - -Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param`` -over usage within a test so that the test is appropriately marked during the -collection phase of pytest. For xfailing a test that involves multiple -parameters, a fixture, or a combination of these, it is only possible to -xfail during the testing phase. To do so, use the ``request`` fixture: - -.. code-block:: python - - import pytest - - def test_xfail(request): - request.node.add_marker(pytest.mark.xfail(reason="Indicate why here")) - -xfail is not to be used for tests involving failure due to invalid user arguments. -For these tests, we need to verify the correct exception type and error message -is being raised, using ``pytest.raises`` instead. - -Miscellaneous -============= - -Reading from a url ------------------- - -**Good:** - -.. code-block:: python - - from pandas.io.common import urlopen - - with urlopen("http://www.google.com") as url: - raw_text = url.read() diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index b05f026bbbb44..449b6de36cd24 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -43,7 +43,7 @@ Pre-commit ---------- Additionally, :ref:`Continuous Integration ` will run code formatting checks -like ``black``, ``flake8`` (including a `pandas-dev-flaker `_ plugin), +like ``black``, ``flake8``, ``isort``, and ``cpplint`` and more using `pre-commit hooks `_ Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore, it is helpful to run the check yourself before submitting code. This diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5895a06792ffb..9dbe450261e54 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -320,7 +320,7 @@ Null-values are no longer coerced to NaN-value in value_counts and mode ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, -``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This +``NaT`` and other null-values to a NaN-value for ``np.object_``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`). diff --git a/pandas/__init__.py b/pandas/__init__.py index 951cb38656d0b..048d20f0de72f 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -135,7 +135,7 @@ ) from pandas import api, arrays, errors, io, plotting, tseries -from pandas import testing # noqa:PDF015 +from pandas import testing from pandas.util._print_versions import show_versions from pandas.io.api import ( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc7b876cb5de8..b56cf2a23a45f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1482,7 +1482,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return val if values.descr.type_num != NPY_OBJECT: - # i.e. values.dtype != np.object + # i.e. values.dtype != np.object_ # This should not be reached values = values.astype(object) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 085a2a80ca8ec..63e7f9c61ccb1 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -14,7 +14,10 @@ import sys from pandas._typing import F -import pandas.compat._compressors +from pandas.compat._compressors import ( + LZMAFile, + has_lzma, +) from pandas.compat._constants import ( IS64, PY39, @@ -131,7 +134,7 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: +def get_lzma_file() -> type[LZMAFile]: """ Importing the `LZMAFile` class from the `lzma` module. @@ -145,13 +148,13 @@ def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: RuntimeError If the `lzma` module was not imported correctly, or didn't exist. """ - if not pandas.compat._compressors.has_lzma: + if not has_lzma: raise RuntimeError( "lzma module not available. " "A Python re-install with the proper dependencies, " "might be required to solve this issue." ) - return pandas.compat._compressors.LZMAFile + return LZMAFile __all__ = [ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97262b1f4bb21..40d3bcfe7bf12 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Callable # noqa: PDF001 +from collections import abc import re from typing import Union @@ -302,7 +302,7 @@ def _str_endswith(self, pat: str, na=None): def _str_replace( self, pat: str | re.Pattern, - repl: str | Callable, + repl: str | abc.Callable, n: int = -1, case: bool = True, flags: int = 0, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c893e9ce3d9a9..4ad0faa81eb2a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -133,11 +133,11 @@ from pandas.core import ( algorithms as algos, arraylike, + common, indexing, nanops, sample, ) -from pandas.core import common # noqa: PDF018 from pandas.core.array_algos.replace import should_use_regex from pandas.core.arrays import ExtensionArray from pandas.core.base import PandasObject diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 9139cb41e3af7..0de9b130f0aab 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -7,8 +7,10 @@ from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg if TYPE_CHECKING: - import pandas as pd - from pandas import Index + from pandas import ( + DataFrame, + Index, + ) class PandasDataFrameXchg(DataFrameXchg): @@ -21,7 +23,7 @@ class PandasDataFrameXchg(DataFrameXchg): """ def __init__( - self, df: pd.DataFrame, nan_as_null: bool = False, allow_copy: bool = True + self, df: DataFrame, nan_as_null: bool = False, allow_copy: bool = True ) -> None: """ Constructor - an instance of this (private) class is returned from diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 53f347ec4d372..c70a7e29bf3ce 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -15,7 +15,7 @@ import numpy as np -from pandas._config.config import _global_config +from pandas._config import config from pandas._libs import ( algos as libalgos, @@ -2354,7 +2354,7 @@ def _preprocess_slice_or_indexer( return "fancy", indexer, len(indexer) -_mode_options = _global_config["mode"] +_mode_options = config._global_config["mode"] def _using_copy_on_write(): diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index b5618207ab9d8..6cce14de61637 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -1,7 +1,7 @@ from __future__ import annotations import abc -from collections.abc import Callable # noqa: PDF001 +import collections import re from typing import ( TYPE_CHECKING, @@ -69,7 +69,7 @@ def _str_endswith(self, pat, na=None): def _str_replace( self, pat: str | re.Pattern, - repl: str | Callable, + repl: str | collections.abc.Callable, n: int = -1, case: bool = True, flags: int = 0, diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 2d77cd0da816f..fb2478bfbe44e 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Callable # noqa: PDF001 +from collections import abc import functools import re import sys @@ -153,7 +153,7 @@ def _str_endswith(self, pat, na=None): def _str_replace( self, pat: str | re.Pattern, - repl: str | Callable, + repl: str | abc.Callable, n: int = -1, case: bool = True, flags: int = 0, diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index a6c32133803d4..c0a7b2b7cc361 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -26,7 +26,7 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core import common # noqa: PDF018 +from pandas.core import common from pandas.core.indexers.objects import ( BaseIndexer, ExponentialMovingWindowIndexer, diff --git a/pandas/io/common.py b/pandas/io/common.py index 4dae46c8f39f6..338bb6f50ea5d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -478,7 +478,7 @@ def file_path_to_url(path: str) -> str: return urljoin("file:", pathname2url(path)) -_extension_to_compression = { +extension_to_compression = { ".tar": "tar", ".tar.gz": "tar", ".tar.bz2": "tar", @@ -489,7 +489,7 @@ def file_path_to_url(path: str) -> str: ".xz": "xz", ".zst": "zstd", } -_supported_compressions = set(_extension_to_compression.values()) +_supported_compressions = set(extension_to_compression.values()) def get_compression_method( @@ -565,7 +565,7 @@ def infer_compression( return None # Infer compression from the filename/URL extension - for extension, compression in _extension_to_compression.items(): + for extension, compression in extension_to_compression.items(): if filepath_or_buffer.lower().endswith(extension): return compression return None diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index f2780d5fa6832..88974f3ab4afa 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -57,7 +57,7 @@ from pandas.io.common import ( IOHandles, - _extension_to_compression, + extension_to_compression, file_exists, get_handle, is_fsspec_url, @@ -854,7 +854,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): elif ( isinstance(filepath_or_buffer, str) and filepath_or_buffer.lower().endswith( - (".json",) + tuple(f".json{c}" for c in _extension_to_compression) + (".json",) + tuple(f".json{c}" for c in extension_to_compression) ) and not file_exists(filepath_or_buffer) ): diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 4f61455826286..44e550243b938 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -46,10 +46,7 @@ if TYPE_CHECKING: from xml.etree.ElementTree import Element - from lxml.etree import ( - _Element, - _XSLTResultTree, - ) + from lxml import etree from pandas import DataFrame @@ -417,7 +414,7 @@ def _validate_names(self) -> None: def _parse_doc( self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] - ) -> Element | _Element: + ) -> Element | etree._Element: """ Build tree from path_or_buffer. @@ -625,7 +622,7 @@ def _validate_names(self) -> None: def _parse_doc( self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] - ) -> _Element: + ) -> etree._Element: from lxml.etree import ( XMLParser, fromstring, @@ -656,7 +653,7 @@ def _parse_doc( return document - def _transform_doc(self) -> _XSLTResultTree: + def _transform_doc(self) -> etree._XSLTResultTree: """ Transform original tree using stylesheet. diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 8e9112b531fad..b484dc39cf23b 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -67,7 +67,7 @@ def test_ufuncs_unary(ufunc): def test_ufunc_numeric(): - # np.sqrt on np.bool returns float16, which we upcast to Float32 + # np.sqrt on np.bool_ returns float16, which we upcast to Float32 # bc we do not have Float16 arr = pd.array([True, False, None], dtype="boolean") diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 32ab0336aa93f..46fb614d96633 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -707,13 +707,13 @@ def test_applymap_subset_multiindex(self, slice_): and isinstance(slice_[-1][-1], list) and "C" in slice_[-1][-1] ): - ctx = pytest.raises(KeyError, match="C") # noqa: PDF010 + ctx = pytest.raises(KeyError, match="C") elif ( isinstance(slice_[0], tuple) and isinstance(slice_[0][1], list) and 3 in slice_[0][1] ): - ctx = pytest.raises(KeyError, match="3") # noqa: PDF010 + ctx = pytest.raises(KeyError, match="3") else: ctx = contextlib.nullcontext() diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 53be06cd491ef..fdae2d47b6389 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -357,7 +357,7 @@ def test_timeseries_preepoch(setup_path): _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) except OverflowError: if is_platform_windows(): - pytest.xfail("known failure on some windows platforms") + pytest.mark.xfail("known failure on some windows platforms") else: raise diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f74d268690a5b..2172a4bf839fb 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -317,7 +317,7 @@ def test_read_expands_user_home_dir( ), ], ) - @pytest.mark.filterwarnings( # pytables np.object usage + @pytest.mark.filterwarnings( # pytables np.object_ usage "ignore:`np.object` is a deprecated alias:DeprecationWarning" ) def test_read_fspath_all(self, reader, module, path, datapath): @@ -372,7 +372,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.filterwarnings( # pytables np.object usage + @pytest.mark.filterwarnings( # pytables np.object_ usage "ignore:`np.object` is a deprecated alias:DeprecationWarning" ) def test_write_fspath_hdf5(self): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 782753177f245..fc15ff3488ce9 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -19,7 +19,7 @@ import pandas.io.common as icom _compression_to_extension = { - value: key for key, value in icom._extension_to_compression.items() + value: key for key, value in icom.extension_to_compression.items() } diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 3dafe6fe61b35..24ac1b72f1476 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -255,7 +255,7 @@ def get_random_path(): class TestCompression: - _extension_to_compression = icom._extension_to_compression + _extension_to_compression = icom.extension_to_compression def compress_file(self, src_path, dest_path, compression): if compression is None: diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index a95cc41f6397d..1347f6eb50b09 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -234,9 +234,7 @@ def test_type_check(errors): # see gh-11776 df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]}) kwargs = {"errors": errors} if errors is not None else {} - error_ctx = pytest.raises(TypeError, match="1-d array") - - with error_ctx: + with pytest.raises(TypeError, match="1-d array"): to_numeric(df, **kwargs) diff --git a/scripts/sync_flake8_versions.py b/scripts/sync_flake8_versions.py index 8852634c5d796..0d513d5937dbe 100644 --- a/scripts/sync_flake8_versions.py +++ b/scripts/sync_flake8_versions.py @@ -1,5 +1,5 @@ """ -Check that the flake8 (and pandas-dev-flaker) pins are the same in: +Check that the flake8 pins are the same in: - environment.yml - .pre-commit-config.yaml, in the flake8 hook @@ -103,17 +103,13 @@ def get_revisions( precommit_config: YamlMapping, environment: YamlMapping ) -> tuple[Revisions, Revisions]: flake8_revisions = Revisions(name="flake8") - pandas_dev_flaker_revisions = Revisions(name="pandas-dev-flaker") repos = precommit_config["repos"] flake8_repo, flake8_hook = _get_repo_hook(repos, "flake8") flake8_revisions.pre_commit = Revision("flake8", "==", flake8_repo["rev"]) flake8_additional_dependencies = [] for dep in _process_dependencies(flake8_hook.get("additional_dependencies", [])): - if dep.name == "pandas-dev-flaker": - pandas_dev_flaker_revisions.pre_commit = dep - else: - flake8_additional_dependencies.append(dep) + flake8_additional_dependencies.append(dep) environment_dependencies = environment["dependencies"] environment_additional_dependencies = [] @@ -121,8 +117,6 @@ def get_revisions( if dep.name == "flake8": flake8_revisions.environment = dep environment_additional_dependencies.append(dep) - elif dep.name == "pandas-dev-flaker": - pandas_dev_flaker_revisions.environment = dep else: environment_additional_dependencies.append(dep) @@ -131,8 +125,7 @@ def get_revisions( environment_additional_dependencies, ) - for revisions in flake8_revisions, pandas_dev_flaker_revisions: - _validate_revisions(revisions) + _validate_revisions(flake8_revisions) if __name__ == "__main__": diff --git a/scripts/tests/test_sync_flake8_versions.py b/scripts/tests/test_sync_flake8_versions.py index 743ece34e0b56..2349a4f5d8d1c 100644 --- a/scripts/tests/test_sync_flake8_versions.py +++ b/scripts/tests/test_sync_flake8_versions.py @@ -87,7 +87,6 @@ def test_get_revisions_no_failure(capsys): { "id": "flake8", "additional_dependencies": [ - "pandas-dev-flaker==0.4.0", "flake8-bugs==1.1.1", ], } @@ -101,7 +100,6 @@ def test_get_revisions_no_failure(capsys): "id": "yesqa", "additional_dependencies": [ "flake8==0.1.1", - "pandas-dev-flaker==0.4.0", "flake8-bugs==1.1.1", ], } @@ -116,7 +114,6 @@ def test_get_revisions_no_failure(capsys): { "pip": [ "git+https://github.com/pydata/pydata-sphinx-theme.git@master", - "pandas-dev-flaker==0.4.0", ] }, ] diff --git a/scripts/tests/test_use_pd_array_in_core.py b/scripts/tests/test_use_pd_array_in_core.py index 9c66199a82846..8f13a6e735899 100644 --- a/scripts/tests/test_use_pd_array_in_core.py +++ b/scripts/tests/test_use_pd_array_in_core.py @@ -14,7 +14,7 @@ def test_inconsistent_usage(content, capsys): result_msg = ( "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n" ) - with pytest.raises(SystemExit): + with pytest.raises(SystemExit, match=None): use_pd_array(content, PATH) expected_msg, _ = capsys.readouterr() assert result_msg == expected_msg diff --git a/setup.cfg b/setup.cfg index ef84dd7f9ce85..562ae70fd73ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,10 +35,6 @@ ignore = B023 # Functions defined inside a loop must not use variables redefined in the loop B301, - # single-letter variables - PDF023, - # "use 'pandas._testing' instead" in non-test code - PDF025, # If test must be a simple comparison against sys.platform or sys.version_info Y002, # Use "_typeshed.Self" instead of class-bound TypeVar @@ -59,18 +55,6 @@ exclude = versioneer.py, # exclude asv benchmark environments from linting env -per-file-ignores = - # private import across modules - pandas/tests/*:PDF020 - # pytest.raises without match= - pandas/tests/extension/*:PDF009 - # os.remove - doc/make.py:PDF008 - # import from pandas._testing - pandas/testing.py:PDF014 - # can't use fixtures in asv - asv_bench/*:PDF016 - [flake8-rst] max-line-length = 84 From 0358a9105e73b195bd781dfbc261733908bc6d7a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 31 Dec 2022 21:05:25 +0000 Subject: [PATCH 03/13] fixup? --- pandas/compat/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 63e7f9c61ccb1..05510c1713ef1 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -14,10 +14,7 @@ import sys from pandas._typing import F -from pandas.compat._compressors import ( - LZMAFile, - has_lzma, -) +import pandas.compat from pandas.compat._constants import ( IS64, PY39, @@ -134,7 +131,7 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[LZMAFile]: +def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: """ Importing the `LZMAFile` class from the `lzma` module. @@ -148,13 +145,13 @@ def get_lzma_file() -> type[LZMAFile]: RuntimeError If the `lzma` module was not imported correctly, or didn't exist. """ - if not has_lzma: + if not pandas.compat._compressors.has_lzma: raise RuntimeError( "lzma module not available. " "A Python re-install with the proper dependencies, " "might be required to solve this issue." ) - return LZMAFile + return pandas.compat._compressors.LZMAFile __all__ = [ From 8036f386699a29de5e4c440c9c13ba47d3ba5fc2 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 1 Jan 2023 08:10:10 +0000 Subject: [PATCH 04/13] make _compressors public --- .pre-commit-config.yaml | 4 ++-- pandas/compat/__init__.py | 8 ++++---- pandas/compat/{_compressors.py => compressors.py} | 0 pandas/io/common.py | 2 +- pandas/tests/io/test_pickle.py | 2 +- scripts/validate_unwanted_patterns.py | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) rename pandas/compat/{_compressors.py => compressors.py} (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d2179bc3e0e1..a47503de3c227 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -329,8 +329,8 @@ repos: language: python entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" types_or: [python, cython] - - id: unwanted-patterns-strings-with-wrong-placed-whitespace - name: Check for strings with wrong placed spaces + - id: unwanted-patterns-strings-with-misplaced-whitespace + name: Check for strings with misplaced spaces language: python entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" types_or: [python, cython] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 05510c1713ef1..b59b9632913e4 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -14,7 +14,6 @@ import sys from pandas._typing import F -import pandas.compat from pandas.compat._constants import ( IS64, PY39, @@ -22,6 +21,7 @@ PY311, PYPY, ) +import pandas.compat.compressors from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, @@ -131,7 +131,7 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: +def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: """ Importing the `LZMAFile` class from the `lzma` module. @@ -145,13 +145,13 @@ def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: RuntimeError If the `lzma` module was not imported correctly, or didn't exist. """ - if not pandas.compat._compressors.has_lzma: + if not pandas.compat.compressors.has_lzma: raise RuntimeError( "lzma module not available. " "A Python re-install with the proper dependencies, " "might be required to solve this issue." ) - return pandas.compat._compressors.LZMAFile + return pandas.compat.compressors.LZMAFile __all__ = [ diff --git a/pandas/compat/_compressors.py b/pandas/compat/compressors.py similarity index 100% rename from pandas/compat/_compressors.py rename to pandas/compat/compressors.py diff --git a/pandas/io/common.py b/pandas/io/common.py index 338bb6f50ea5d..6deaf40f00c69 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -55,8 +55,8 @@ WriteBuffer, ) from pandas.compat import get_lzma_file -from pandas.compat._compressors import BZ2File as _BZ2File from pandas.compat._optional import import_optional_dependency +from pandas.compat.compressors import BZ2File as _BZ2File from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 24ac1b72f1476..07a028a19d7f9 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -37,8 +37,8 @@ get_lzma_file, is_platform_little_endian, ) -from pandas.compat._compressors import flatten_buffer from pandas.compat._optional import import_optional_dependency +from pandas.compat.compressors import flatten_buffer import pandas.util._test_decorators as td import pandas as pd diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index f90fd6c3e8a3e..f78b9b8ced863 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -218,7 +218,7 @@ def private_import_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]] tree = ast.parse(contents) for node in ast.walk(tree): - if not (isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom)): + if not isinstance(node, (ast.Import, ast.ImportFrom)): continue for module in node.names: From cca36f7492a25c2c50c6731698316ab729c54ce0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 1 Jan 2023 12:12:57 +0000 Subject: [PATCH 05/13] use typing.Callable --- pandas/core/strings/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 6cce14de61637..c96e5a1abcf86 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -1,10 +1,10 @@ from __future__ import annotations import abc -import collections import re from typing import ( TYPE_CHECKING, + Callable, Literal, ) @@ -69,7 +69,7 @@ def _str_endswith(self, pat, na=None): def _str_replace( self, pat: str | re.Pattern, - repl: str | collections.abc.Callable, + repl: str | Callable, n: int = -1, case: bool = True, flags: int = 0, From ba2533db09b4ecd818d2061d9d015a76ea202ca0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 1 Jan 2023 12:43:43 +0000 Subject: [PATCH 06/13] reduce diff --- .pre-commit-config.yaml | 38 ++++++++++++++++++++++ ci/code_checks.sh | 71 ++--------------------------------------- 2 files changed, 40 insertions(+), 69 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a47503de3c227..b4f8d04072c79 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -227,6 +227,15 @@ repos: # Check for deprecated messages without sphinx directive |(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.) + + # {foo!r} instead of {repr(foo)} + |!r} + + # builtin filter function + |(?obj` as opposed to ` obj` + [a-zA-Z0-9*]> + types: [cython] - id: pip-to-conda name: Generate pip dependency from conda language: python diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 228cc5518495e..3c1362b1ac83e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -8,16 +8,14 @@ # # Usage: # $ ./ci/code_checks.sh # run all checks -# $ ./ci/code_checks.sh lint # run linting only -# $ ./ci/code_checks.sh patterns # check for patterns that should not exist # $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors # $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free # $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks -[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ - { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } +[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ + { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } BASE_DIR="$(dirname $0)/.." RET=0 @@ -40,71 +38,6 @@ if [[ "$GITHUB_ACTIONS" == "true" ]]; then INVGREP_PREPEND="##[error]" fi -### LINTING ### -if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then - - # Check that cython casting is of the form `obj` as opposed to ` obj`; - # it doesn't make a difference, but we want to be internally consistent. - # Note: this grep pattern is (intended to be) equivalent to the python - # regex r'(?])> ' - MSG='Linting .pyx code for spacing conventions in casting' ; echo $MSG - invgrep -r -E --include '*.pyx' --include '*.pxi.in' '[a-zA-Z0-9*]> ' pandas/_libs - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # readability/casting: Warnings about C casting instead of C++ casting - # runtime/int: Warnings about using C number types instead of C++ ones - # build/include_subdir: Warnings about prefacing included header files with directory - -fi - -### PATTERNS ### -if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then - - MSG='Check for use of exec' ; echo $MSG - invgrep -R --include="*.py*" -E "[^a-zA-Z0-9_]exec\(" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for pytest warns' ; echo $MSG - invgrep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for pytest raises without context' ; echo $MSG - invgrep -r -E --include '*.py' "[[:space:]] pytest.raises" pandas/tests/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of builtin filter function' ; echo $MSG - invgrep -R --include="*.py" -P '(? Date: Sun, 1 Jan 2023 12:46:25 +0000 Subject: [PATCH 07/13] further reduce --- pandas/core/strings/object_array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index fb2478bfbe44e..508ac122d67af 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -1,12 +1,12 @@ from __future__ import annotations -from collections import abc import functools import re import sys import textwrap from typing import ( TYPE_CHECKING, + Callable, Literal, ) import unicodedata @@ -153,7 +153,7 @@ def _str_endswith(self, pat, na=None): def _str_replace( self, pat: str | re.Pattern, - repl: str | abc.Callable, + repl: str | Callable, n: int = -1, case: bool = True, flags: int = 0, From 098f988ea1b8a6347edc83c5645653946c2bbd10 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 1 Jan 2023 12:51:20 +0000 Subject: [PATCH 08/13] one more --- pandas/core/arrays/string_arrow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 40d3bcfe7bf12..fb081d0e63c96 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,8 +1,10 @@ from __future__ import annotations -from collections import abc import re -from typing import Union +from typing import ( + Callable, + Union, +) import numpy as np @@ -302,7 +304,7 @@ def _str_endswith(self, pat: str, na=None): def _str_replace( self, pat: str | re.Pattern, - repl: str | abc.Callable, + repl: str | Callable, n: int = -1, case: bool = True, flags: int = 0, From 4826c2a2d80cbdffc9dca3d03bdfc0486c8d30bc Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 1 Jan 2023 13:04:39 +0000 Subject: [PATCH 09/13] fixup regex --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b4f8d04072c79..10d6fdab5db39 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -318,7 +318,7 @@ repos: entry: | (?x) # `obj` as opposed to ` obj` - [a-zA-Z0-9*]> + [a-zA-Z0-9*]>\s types: [cython] - id: pip-to-conda name: Generate pip dependency from conda From 87d460076351a366550a1e52279aea6cc92a4dbc Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 1 Jan 2023 13:07:21 +0000 Subject: [PATCH 10/13] fixup regex --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 10d6fdab5db39..dab3a98f2253b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -318,7 +318,7 @@ repos: entry: | (?x) # `obj` as opposed to ` obj` - [a-zA-Z0-9*]>\s + [a-zA-Z0-9*]>[ ] types: [cython] - id: pip-to-conda name: Generate pip dependency from conda From 75626216290070ea6b6ef4d5b2a55bf5b7a8fc72 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 3 Jan 2023 18:32:23 +0000 Subject: [PATCH 11/13] add xfail to request --- pandas/tests/io/pytables/test_round_trip.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index fdae2d47b6389..00f0363d7fe92 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -349,7 +349,7 @@ def test_index_types(setup_path): _check_roundtrip(ser, func, path=setup_path) -def test_timeseries_preepoch(setup_path): +def test_timeseries_preepoch(setup_path, request): dr = bdate_range("1/1/1940", "1/1/1960") ts = Series(np.random.randn(len(dr)), index=dr) @@ -357,7 +357,9 @@ def test_timeseries_preepoch(setup_path): _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) except OverflowError: if is_platform_windows(): - pytest.mark.xfail("known failure on some windows platforms") + request.node.add_marker( + pytest.mark.xfail("known failure on some windows platforms") + ) else: raise From c86353b2840e0bd24eba6a744a24354e67826f67 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 3 Jan 2023 20:04:00 +0000 Subject: [PATCH 12/13] remove else after xfail mark --- pandas/tests/io/pytables/test_round_trip.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 00f0363d7fe92..5c7c4f9ce0b75 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -360,8 +360,7 @@ def test_timeseries_preepoch(setup_path, request): request.node.add_marker( pytest.mark.xfail("known failure on some windows platforms") ) - else: - raise + raise @pytest.mark.parametrize( From 084efe021dc516073d01ed3545aad52e8c896e7a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 4 Jan 2023 16:54:02 +0000 Subject: [PATCH 13/13] make copy_on_write public --- pandas/core/generic.py | 4 ++-- pandas/core/internals/managers.py | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 51f3a4eae6ade..21b2907bbf2ef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -159,7 +159,7 @@ SingleArrayManager, ) from pandas.core.internals.construction import mgr_to_mgr -from pandas.core.internals.managers import _using_copy_on_write +from pandas.core.internals.managers import using_copy_on_write from pandas.core.missing import ( clean_fill_method, clean_reindex_fill_method, @@ -10102,7 +10102,7 @@ def truncate( if isinstance(ax, MultiIndex): setattr(result, self._get_axis_name(axis), ax.truncate(before, after)) - if copy or (copy is None and not _using_copy_on_write()): + if copy or (copy is None and not using_copy_on_write()): result = result.copy(deep=copy) return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c70a7e29bf3ce..c4e869a3f6a45 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -376,7 +376,7 @@ def setitem(self: T, indexer, value) -> T: if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") - if _using_copy_on_write() and not self._has_no_reference(0): + if using_copy_on_write() and not self._has_no_reference(0): # if being referenced -> perform Copy-on-Write and clear the reference # this method is only called if there is a single block -> hardcoded 0 self = self.copy() @@ -385,7 +385,7 @@ def setitem(self: T, indexer, value) -> T: def putmask(self, mask, new, align: bool = True): if ( - _using_copy_on_write() + using_copy_on_write() and self.refs is not None and not all(ref is None for ref in self.refs) ): @@ -429,7 +429,7 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: limit = libalgos.validate_limit(None, limit=limit) if inplace: # TODO(CoW) can be optimized to only copy those blocks that have refs - if _using_copy_on_write() and any( + if using_copy_on_write() and any( not self._has_no_reference_block(i) for i in range(len(self.blocks)) ): self = self.copy() @@ -613,7 +613,7 @@ def copy(self: T, deep: bool | None | Literal["all"] = True) -> T: BlockManager """ if deep is None: - if _using_copy_on_write(): + if using_copy_on_write(): # use shallow copy deep = False else: @@ -701,7 +701,7 @@ def reindex_indexer( pandas-indexer with -1's only. """ if copy is None: - if _using_copy_on_write(): + if using_copy_on_write(): # use shallow copy copy = False else: @@ -879,7 +879,7 @@ def _slice_take_blocks_ax0( # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - if only_slice or _using_copy_on_write(): + if only_slice or using_copy_on_write(): taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): @@ -1039,7 +1039,7 @@ def from_blocks( """ Constructor for BlockManager and SingleBlockManager with same signature. """ - parent = parent if _using_copy_on_write() else None + parent = parent if using_copy_on_write() else None return cls(blocks, axes, refs, parent, verify_integrity=False) # ---------------------------------------------------------------- @@ -1222,7 +1222,7 @@ def value_getitem(placement): blk_locs = blklocs[val_locs.indexer] if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write - if _using_copy_on_write() and not self._has_no_reference_block(blkno_l): + if using_copy_on_write() and not self._has_no_reference_block(blkno_l): blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) self._clear_reference_block(blkno_l) else: @@ -1320,7 +1320,7 @@ def _iset_single( if inplace and blk.should_store(value): copy = False - if _using_copy_on_write() and not self._has_no_reference_block(blkno): + if using_copy_on_write() and not self._has_no_reference_block(blkno): # perform Copy-on-Write and clear the reference copy = True self._clear_reference_block(blkno) @@ -1344,7 +1344,7 @@ def column_setitem( This is a method on the BlockManager level, to avoid creating an intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ - if _using_copy_on_write() and not self._has_no_reference(loc): + if using_copy_on_write() and not self._has_no_reference(loc): # otherwise perform Copy-on-Write and clear the reference blkno = self.blknos[loc] blocks = list(self.blocks) @@ -1839,7 +1839,7 @@ def __init__( self.axes = [axis] self.blocks = (block,) self.refs = refs - self.parent = parent if _using_copy_on_write() else None + self.parent = parent if using_copy_on_write() else None @classmethod def from_blocks( @@ -1876,7 +1876,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: new_blk = type(blk)(arr, placement=bp, ndim=2) axes = [columns, self.axes[0]] refs: list[weakref.ref | None] = [weakref.ref(blk)] - parent = self if _using_copy_on_write() else None + parent = self if using_copy_on_write() else None return BlockManager( [new_blk], axes=axes, refs=refs, parent=parent, verify_integrity=False ) @@ -2020,7 +2020,7 @@ def setitem_inplace(self, indexer, value) -> None: in place, not returning a new Manager (and Block), and thus never changing the dtype. """ - if _using_copy_on_write() and not self._has_no_reference(0): + if using_copy_on_write() and not self._has_no_reference(0): self.blocks = (self._block.copy(),) self.refs = None self.parent = None @@ -2357,5 +2357,5 @@ def _preprocess_slice_or_indexer( _mode_options = config._global_config["mode"] -def _using_copy_on_write(): +def using_copy_on_write(): return _mode_options["copy_on_write"]