From 272099bea6cfacd53bfc25de9ce0d164a2ed8448 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Thu, 26 Dec 2019 18:56:33 +0200 Subject: [PATCH 1/6] CI: Add test case for unwanted patterns --- ci/code_checks.sh | 4 ++ scripts/validate_string_concatenation.py | 88 ++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100755 scripts/validate_string_concatenation.py diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 46ace2dd9d70e..8784cfaad1a8e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -216,6 +216,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include=*.{py,pyx} 'xrange' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of not concatenated strings' ; echo $MSG + python $BASE_DIR/scripts/validate_string_concatenation.py pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG INVGREP_APPEND=" <- trailing whitespaces found" invgrep -RI --exclude=\*.{svg,c,cpp,html,js} --exclude-dir=env "\s$" * diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py new file mode 100755 index 0000000000000..fc00d5861db15 --- /dev/null +++ b/scripts/validate_string_concatenation.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +""" +GH #30454 + +Check where there is a string that needs to be concatenated. + +This is necessary after black formating, +where for example black transforms this: + +>>> foo = ( +... "bar " +... "baz" +... ) + +into this: + +>>> foo = ("bar " "baz") + +Black is not considering this as an +issue (see https://github.com/psf/black/issues/1051), so we are checking +it here. +""" + +import os +import sys +import token +import tokenize + +# Can be annotated as typing.FrozenSet[str] +FILE_EXTENSIONS_TO_CHECK = frozenset((".pxd", ".py", ".pyx", ".pyx.ini")) + + +def is_concatenated(file_path): + """ + Checking if the file containing strings that needs to be concatenated. + + Parameters + ---------- + file_path : str + File path pointing to a single file. + + Returns + ------- + int + Status code representing if the file needs a fix. + 0 - All good. + 1 - Needs to be fixed. + """ + need_fix = False + with open(file_path, "r") as file_name: + tokens = list(tokenize.generate_tokens(file_name.readline)) + for current_token, next_token in zip(tokens, tokens[1:]): + if current_token[0] == next_token[0] == token.STRING: + need_fix = True + print( + "{file_path}:{line_number}:\t{start} and {end}".format( + file_path=file_path, + line_number=current_token[2][0], + start=current_token[1], + end=next_token[1], + ) + ) + + return int(need_fix) + + +if __name__ == "__main__": + path = sys.argv[1] + + if not os.path.exists(path): + raise ValueError("Please enter a valid path, to a file/directory.") + + if os.path.isfile(path): + # Means that the given path is of a single file. + sys.exit(is_concatenated(path)) + + failures = 0 + # Means that the given path is of a directory. + for subdir, _, files in os.walk(path): + for file_name in files: + if any( + file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK + ): + file_extension = os.path.join(subdir, file_name) + failures += is_concatenated(os.path.join(subdir, file_name)) + + exit_code = 1 if failures >= 1 else 0 + sys.exit(exit_code) From 7a588bca3c8f8ac2321c4508ebf215303f8fe73f Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Thu, 26 Dec 2019 23:23:48 +0200 Subject: [PATCH 2/6] Refactored the code to use a generator --- ci/code_checks.sh | 8 +-- scripts/validate_string_concatenation.py | 70 ++++++++++++------------ 2 files changed, 38 insertions(+), 40 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8784cfaad1a8e..ae2b62f607416 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,6 +100,10 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/_libs/*.cpp RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of not concatenated strings' ; echo $MSG + $BASE_DIR/scripts/validate_string_concatenation.py pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + echo "isort --version-number" isort --version-number @@ -216,10 +220,6 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include=*.{py,pyx} 'xrange' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for use of not concatenated strings' ; echo $MSG - python $BASE_DIR/scripts/validate_string_concatenation.py pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG INVGREP_APPEND=" <- trailing whitespaces found" invgrep -RI --exclude=\*.{svg,c,cpp,html,js} --exclude-dir=env "\s$" * diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index fc00d5861db15..e16550661361b 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ GH #30454 @@ -12,77 +12,75 @@ ... "baz" ... ) + into this: >>> foo = ("bar " "baz") Black is not considering this as an -issue (see https://github.com/psf/black/issues/1051), so we are checking -it here. +issue (see issue https://github.com/psf/black/issues/1051), +so we are checking it here. """ import os import sys import token import tokenize +from typing import FrozenSet, Generator, List -# Can be annotated as typing.FrozenSet[str] -FILE_EXTENSIONS_TO_CHECK = frozenset((".pxd", ".py", ".pyx", ".pyx.ini")) +FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset( + (".pxd", ".py", ".pyx", ".pyx.ini") +) -def is_concatenated(file_path): +def strings_to_concatenate(file_path: str) -> Generator[str, None, None]: """ - Checking if the file containing strings that needs to be concatenated. + Yielding the strings that needs to be concatenated in a given file. Parameters ---------- file_path : str File path pointing to a single file. - Returns - ------- - int - Status code representing if the file needs a fix. - 0 - All good. - 1 - Needs to be fixed. + Yields + ------ + str + Message containing info about the string that needs to be concatenated. """ - need_fix = False with open(file_path, "r") as file_name: - tokens = list(tokenize.generate_tokens(file_name.readline)) - for current_token, next_token in zip(tokens, tokens[1:]): - if current_token[0] == next_token[0] == token.STRING: - need_fix = True - print( - "{file_path}:{line_number}:\t{start} and {end}".format( - file_path=file_path, - line_number=current_token[2][0], - start=current_token[1], - end=next_token[1], - ) - ) - - return int(need_fix) + tokens: List = list(tokenize.generate_tokens(file_name.readline)) + + for current_token, next_token in zip(tokens, tokens[1:]): + if current_token[0] == next_token[0] == token.STRING: + line_number = current_token[2][0] + start = current_token[1] + end = next_token[1] + yield f"{file_path}:{line_number}:\t between {start} and {end}\n" if __name__ == "__main__": - path = sys.argv[1] + path: str = sys.argv[1] if not os.path.exists(path): raise ValueError("Please enter a valid path, to a file/directory.") + failed: bool = False + if os.path.isfile(path): - # Means that the given path is of a single file. - sys.exit(is_concatenated(path)) + for msg in strings_to_concatenate(path): + if msg: + failed = True + print(msg) - failures = 0 - # Means that the given path is of a directory. for subdir, _, files in os.walk(path): for file_name in files: if any( file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK ): file_extension = os.path.join(subdir, file_name) - failures += is_concatenated(os.path.join(subdir, file_name)) - exit_code = 1 if failures >= 1 else 0 - sys.exit(exit_code) + for msg in strings_to_concatenate(os.path.join(subdir, file_name)): + if msg: + failed = True + print(msg) + sys.exit(failed) From c19a2a5bc55697a2e9d79b51cb8492b6cbc7a958 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Sun, 29 Dec 2019 16:39:04 +0200 Subject: [PATCH 3/6] Fixes for datapythonista's review --- ci/code_checks.sh | 6 +- scripts/validate_string_concatenation.py | 116 +++++++++++++++-------- 2 files changed, 81 insertions(+), 41 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ae2b62f607416..bc58f59efacc7 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -101,7 +101,11 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for use of not concatenated strings' ; echo $MSG - $BASE_DIR/scripts/validate_string_concatenation.py pandas + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:String unnecessarily split in two by black. Please merge them manually." . + else + $BASE_DIR/scripts/validate_string_concatenation.py . + fi RET=$(($RET + $?)) ; echo $MSG "DONE" echo "isort --version-number" diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index e16550661361b..cc0922ee97db4 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """ GH #30454 @@ -12,7 +12,6 @@ ... "baz" ... ) - into this: >>> foo = ("bar " "baz") @@ -22,65 +21,102 @@ so we are checking it here. """ +import argparse import os import sys import token import tokenize -from typing import FrozenSet, Generator, List +from typing import Dict, Generator, List + +FILE_EXTENSIONS_TO_CHECK = (".py", ".pyx", ".pyx.ini", ".pxd") -FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset( - (".pxd", ".py", ".pyx", ".pyx.ini") -) +def main(source_path: str, output_format: str) -> bool: + """ + Main entry point of the script. + + Parameters + ---------- + source_path : str, default '.' + Source path representing path to a file/directory. + output_format : str + Output format of the script. + + Returns + ------- + bool + True if found any strings that needs to be concatenated. -def strings_to_concatenate(file_path: str) -> Generator[str, None, None]: + Raises + ------ + ValueError + If the `source_path` is not pointing to existing file/directory. + """ + if not os.path.exists(source_path): + raise ValueError( + "Please enter a valid path, pointing to a valid file/directory." + ) + + is_failed: bool = False + + if os.path.isfile(source_path): + for values in strings_to_concatenate(source_path): + is_failed = True + print(output_format.format(**values)) + + for subdir, _, files in os.walk(source_path): + for file_name in files: + if any( + file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK + ): + for values in strings_to_concatenate(os.path.join(subdir, file_name)): + is_failed = True + print(output_format.format(**values)) + return is_failed + + +def strings_to_concatenate(source_path: str) -> Generator[Dict[str, str], None, None]: """ Yielding the strings that needs to be concatenated in a given file. Parameters ---------- - file_path : str + source_path : str File path pointing to a single file. Yields ------ - str - Message containing info about the string that needs to be concatenated. + dict of {str: str} + Containing: + source_path + Source file path. + line_number + Line number of unconcatenated string. """ - with open(file_path, "r") as file_name: + with open(source_path, "r") as file_name: tokens: List = list(tokenize.generate_tokens(file_name.readline)) for current_token, next_token in zip(tokens, tokens[1:]): if current_token[0] == next_token[0] == token.STRING: - line_number = current_token[2][0] - start = current_token[1] - end = next_token[1] - yield f"{file_path}:{line_number}:\t between {start} and {end}\n" + yield {"source_path": source_path, "line_number": current_token[2][0]} if __name__ == "__main__": - path: str = sys.argv[1] - - if not os.path.exists(path): - raise ValueError("Please enter a valid path, to a file/directory.") - - failed: bool = False - - if os.path.isfile(path): - for msg in strings_to_concatenate(path): - if msg: - failed = True - print(msg) - - for subdir, _, files in os.walk(path): - for file_name in files: - if any( - file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK - ): - file_extension = os.path.join(subdir, file_name) - - for msg in strings_to_concatenate(os.path.join(subdir, file_name)): - if msg: - failed = True - print(msg) - sys.exit(failed) + parser = argparse.ArgumentParser(description="Validate concatenated strings") + + parser.add_argument( + "path", nargs="?", default=".", help="Source path of file/directory to check." + ) + parser.add_argument( + "--format", + "-f", + default=( + "{source_path}:{line_number}:String unnecessarily split in two by black. " + "Please merge them manually." + ), + help="Output format of the unconcatenated strings.", + ) + + args = parser.parse_args() + + sys.exit(main(source_path=args.path, output_format=args.format)) From 2b67601cb5cd22dfac29c675be28cdcc994d508f Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Sun, 29 Dec 2019 17:14:05 +0200 Subject: [PATCH 4/6] Fixes for review --- ci/code_checks.sh | 2 +- scripts/validate_string_concatenation.py | 39 ++++++++++++++++-------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index bc58f59efacc7..dc1c8481b1712 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -102,7 +102,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then MSG='Check for use of not concatenated strings' ; echo $MSG if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:String unnecessarily split in two by black. Please merge them manually." . + $BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:{msg}" . else $BASE_DIR/scripts/validate_string_concatenation.py . fi diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index cc0922ee97db4..99c0d0568b353 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -26,7 +26,7 @@ import sys import token import tokenize -from typing import Dict, Generator, List +from typing import Generator, List, Tuple FILE_EXTENSIONS_TO_CHECK = (".py", ".pyx", ".pyx.ini", ".pxd") @@ -60,22 +60,40 @@ def main(source_path: str, output_format: str) -> bool: is_failed: bool = False if os.path.isfile(source_path): - for values in strings_to_concatenate(source_path): + for source_path, line_number in strings_to_concatenate(source_path): is_failed = True - print(output_format.format(**values)) + msg = ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ) + print( + output_format.format( + source_path=source_path, line_number=line_number, msg=msg + ) + ) for subdir, _, files in os.walk(source_path): for file_name in files: if any( file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK ): - for values in strings_to_concatenate(os.path.join(subdir, file_name)): + for source_path, line_number in strings_to_concatenate( + os.path.join(subdir, file_name) + ): is_failed = True - print(output_format.format(**values)) + msg = ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ) + print( + output_format.format( + source_path=source_path, line_number=line_number, msg=msg + ) + ) return is_failed -def strings_to_concatenate(source_path: str) -> Generator[Dict[str, str], None, None]: +def strings_to_concatenate(source_path: str) -> Generator[Tuple[str, int], None, None]: """ Yielding the strings that needs to be concatenated in a given file. @@ -86,7 +104,7 @@ def strings_to_concatenate(source_path: str) -> Generator[Dict[str, str], None, Yields ------ - dict of {str: str} + Tuple Containing: source_path Source file path. @@ -98,7 +116,7 @@ def strings_to_concatenate(source_path: str) -> Generator[Dict[str, str], None, for current_token, next_token in zip(tokens, tokens[1:]): if current_token[0] == next_token[0] == token.STRING: - yield {"source_path": source_path, "line_number": current_token[2][0]} + yield (source_path, current_token[2][0]) if __name__ == "__main__": @@ -110,10 +128,7 @@ def strings_to_concatenate(source_path: str) -> Generator[Dict[str, str], None, parser.add_argument( "--format", "-f", - default=( - "{source_path}:{line_number}:String unnecessarily split in two by black. " - "Please merge them manually." - ), + default="{source_path}:{line_number}:{msg}", help="Output format of the unconcatenated strings.", ) From 605e70c780de2e4a6cc59a89f4f9be3ced19df54 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Tue, 31 Dec 2019 13:11:14 +0200 Subject: [PATCH 5/6] Fixes for review --- scripts/validate_string_concatenation.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index 99c0d0568b353..8b7b0655db7e6 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -59,13 +59,11 @@ def main(source_path: str, output_format: str) -> bool: is_failed: bool = False + msg = "String unnecessarily split in two by black. Please merge them manually." + if os.path.isfile(source_path): for source_path, line_number in strings_to_concatenate(source_path): is_failed = True - msg = ( - "String unnecessarily split in two by black. " - "Please merge them manually." - ) print( output_format.format( source_path=source_path, line_number=line_number, msg=msg @@ -81,10 +79,6 @@ def main(source_path: str, output_format: str) -> bool: os.path.join(subdir, file_name) ): is_failed = True - msg = ( - "String unnecessarily split in two by black. " - "Please merge them manually." - ) print( output_format.format( source_path=source_path, line_number=line_number, msg=msg @@ -104,19 +98,17 @@ def strings_to_concatenate(source_path: str) -> Generator[Tuple[str, int], None, Yields ------ - Tuple - Containing: - source_path - Source file path. - line_number - Line number of unconcatenated string. + source_path : str + Source file path. + line_number : int + Line number of unconcatenated string. """ with open(source_path, "r") as file_name: tokens: List = list(tokenize.generate_tokens(file_name.readline)) for current_token, next_token in zip(tokens, tokens[1:]): if current_token[0] == next_token[0] == token.STRING: - yield (source_path, current_token[2][0]) + yield source_path, current_token[2][0] if __name__ == "__main__": From 0530d028cabc30a8b4a36895fd39bde767134dc2 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Wed, 1 Jan 2020 14:44:54 +0200 Subject: [PATCH 6/6] Remove incorrect default value from docstring --- scripts/validate_string_concatenation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index 8b7b0655db7e6..3feeddaabe8d2 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -37,7 +37,7 @@ def main(source_path: str, output_format: str) -> bool: Parameters ---------- - source_path : str, default '.' + source_path : str Source path representing path to a file/directory. output_format : str Output format of the script.