From d98c6fd6694f12e914c4869345ab4fc4c8942315 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 19 Dec 2020 22:22:12 +0100 Subject: [PATCH 01/16] ENH: Raise ParserWarning when length of names does not match length of data --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/parsers.pyx | 9 ++++++++- pandas/io/parsers.py | 8 ++++++++ pandas/tests/io/parser/test_common.py | 10 +++++++--- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7671962018144..98bfa5e912a52 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -41,6 +41,7 @@ Other enhancements - Added :meth:`MultiIndex.dtypes` (:issue:`37062`) - Improve error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) +- :func:`read_csv` now raising ``ParserWarning`` if length of header or given names does not match length of data when usecols is not specified (:issue:`21768`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4995252d7aafd..fe932de95cdce 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -729,7 +729,9 @@ cdef class TextReader: field_count = max(field_count, len(self.names)) passed_count = len(header[0]) - + print(self.allow_leading_cols) + print(passed_count) + print(field_count) if (self.has_usecols and self.allow_leading_cols and not callable(self.usecols)): nuse = len(self.usecols) @@ -743,6 +745,11 @@ cdef class TextReader: # oh boy, #2442, #2981 elif self.allow_leading_cols and passed_count < field_count: self.leading_cols = field_count - passed_count + elif not self.allow_leading_cols and passed_count < field_count: + warnings.warn( + "Length of header or names does not match length of data. This leads " + "to a loss of data with index_col=False.", ParserWarning, stacklevel=6, + ) return header, field_count, unnamed_cols diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d670821c98520..7bb12d9f37b83 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2505,6 +2505,14 @@ def _exclude_implicit_index(self, alldata): if self._col_indices is not None and len(names) != len(self._col_indices): names = [names[i] for i in sorted(self._col_indices)] + if not self.index_col and len(names) != len(alldata) and names: + warnings.warn( + "Length of header or names does not match length of data. This leads " + "to a loss of data with index_col=False.", + ParserWarning, + stacklevel=6, + ) + return {name: alldata[i + offset] for i, name in enumerate(names)}, names # legacy diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index df20db0c7ee84..29e0d3b15d16e 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,7 +15,7 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.errors import DtypeWarning, EmptyDataError, ParserError +from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context @@ -1062,6 +1062,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Lenght of header:pandas.errors.ParserWarning") def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1069,7 +1070,9 @@ def test_trailing_delimiters(all_parsers): 4,5,6, 7,8,9,""" parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=False) + + with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv(StringIO(data), index_col=False) expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) tm.assert_frame_equal(result, expected) @@ -2178,7 +2181,8 @@ def test_no_header_two_extra_columns(all_parsers): ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) From 26b07b2db2413c28ce7410625ed883fe850dc8ff Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 19 Dec 2020 22:31:51 +0100 Subject: [PATCH 02/16] Fix bugs from strg+z --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/_libs/parsers.pyx | 4 +--- pandas/tests/io/parser/test_common.py | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 98bfa5e912a52..f647a1acc357e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -41,7 +41,7 @@ Other enhancements - Added :meth:`MultiIndex.dtypes` (:issue:`37062`) - Improve error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) -- :func:`read_csv` now raising ``ParserWarning`` if length of header or given names does not match length of data when usecols is not specified (:issue:`21768`) +- :func:`read_csv` now raising ``ParserWarning`` if length of header or given names does not match length of data when ``usecols`` is not specified (:issue:`21768`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index fe932de95cdce..18a188261d750 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -729,9 +729,7 @@ cdef class TextReader: field_count = max(field_count, len(self.names)) passed_count = len(header[0]) - print(self.allow_leading_cols) - print(passed_count) - print(field_count) + if (self.has_usecols and self.allow_leading_cols and not callable(self.usecols)): nuse = len(self.usecols) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 29e0d3b15d16e..97ecce07e80c5 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1062,7 +1062,6 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.filterwarnings("ignore:Lenght of header:pandas.errors.ParserWarning") def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C From 7dd3f1b1e1c7baa6a4cdbdab44542a261f31aa21 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 19 Dec 2020 22:33:01 +0100 Subject: [PATCH 03/16] Refactor code --- pandas/_libs/parsers.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 18a188261d750..3e0bcf5a7b0b5 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -745,8 +745,10 @@ cdef class TextReader: self.leading_cols = field_count - passed_count elif not self.allow_leading_cols and passed_count < field_count: warnings.warn( - "Length of header or names does not match length of data. This leads " - "to a loss of data with index_col=False.", ParserWarning, stacklevel=6, + "Length of header or names does not match length of data. This " + "leads to a loss of data with index_col=False.", + ParserWarning, + stacklevel=6, ) return header, field_count, unnamed_cols From 70d5c1c0834811529c75de9da834f6d40913dea1 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 19 Dec 2020 22:33:55 +0100 Subject: [PATCH 04/16] Refactor if else --- pandas/_libs/parsers.pyx | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3e0bcf5a7b0b5..6bbfcb9c31ee4 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -741,15 +741,16 @@ cdef class TextReader: raise ValueError('Number of passed names did not match number of ' 'header fields in the file') # oh boy, #2442, #2981 - elif self.allow_leading_cols and passed_count < field_count: - self.leading_cols = field_count - passed_count - elif not self.allow_leading_cols and passed_count < field_count: - warnings.warn( - "Length of header or names does not match length of data. This " - "leads to a loss of data with index_col=False.", - ParserWarning, - stacklevel=6, - ) + if passed_count < field_count: + if self.allow_leading_cols: + self.leading_cols = field_count - passed_count + else: + warnings.warn( + "Length of header or names does not match length of data. This " + "leads to a loss of data with index_col=False.", + ParserWarning, + stacklevel=6, + ) return header, field_count, unnamed_cols From 76abd33d662d03011b76919ce5933dbe32e138bc Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 19 Dec 2020 23:16:44 +0100 Subject: [PATCH 05/16] Add okwarning --- doc/source/user_guide/io.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b04abf512fbeb..aae9fd73cc361 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -753,6 +753,7 @@ the end of each data line, confusing the parser. To explicitly disable the index column inference and discard the last column, pass ``index_col=False``: .. ipython:: python + :okwarning: data = "a,b,c\n4,apple,bat,\n8,orange,cow," print(data) From 5b688f7366e13ddb3cea29bc5912d6b3185b2616 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 3 Jan 2021 20:59:08 +0100 Subject: [PATCH 06/16] Allow trailing commas --- pandas/_libs/parsers.pyx | 12 ++-------- pandas/io/parsers.py | 32 +++++++++++++++++++++------ pandas/tests/io/parser/test_common.py | 4 +--- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6bbfcb9c31ee4..4995252d7aafd 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -741,16 +741,8 @@ cdef class TextReader: raise ValueError('Number of passed names did not match number of ' 'header fields in the file') # oh boy, #2442, #2981 - if passed_count < field_count: - if self.allow_leading_cols: - self.leading_cols = field_count - passed_count - else: - warnings.warn( - "Length of header or names does not match length of data. This " - "leads to a loss of data with index_col=False.", - ParserWarning, - stacklevel=6, - ) + elif self.allow_leading_cols and passed_count < field_count: + self.leading_cols = field_count - passed_count return header, field_count, unnamed_cols diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 17e6b41ba4052..c139869a07a46 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1844,6 +1844,28 @@ def _do_date_conversions(self, names, data): return names, data + def _check_data_length(self, columns: List[str], data: List[np.ndarray]): + """Checks if length of data is equal to length of column names. One set of + trailing commas is allowed. + + Parameters + ---------- + columns: list of column names + data: list of array-likes containing the data column-wise + + """ + if not self.index_col and len(columns) != len(data) and columns: + if len(columns) == len(data) - 1 and np.all( + (data[-1] == "") | isna(data[-1]) + ): + return + warnings.warn( + "Length of header or names does not match length of data. This leads " + "to a loss of data with index_col=False.", + ParserWarning, + stacklevel=6, + ) + class CParserWrapper(ParserBase): def __init__(self, src: FilePathOrBuffer, **kwds): @@ -2128,6 +2150,8 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] + if self.usecols is None: + self._check_data_length(names, alldata) data = {k: v for k, (i, v) in zip(names, data)} @@ -2511,13 +2535,7 @@ def _exclude_implicit_index(self, alldata): if self._col_indices is not None and len(names) != len(self._col_indices): names = [names[i] for i in sorted(self._col_indices)] - if not self.index_col and len(names) != len(alldata) and names: - warnings.warn( - "Length of header or names does not match length of data. This leads " - "to a loss of data with index_col=False.", - ParserWarning, - stacklevel=6, - ) + self._check_data_length(names, alldata) return {name: alldata[i + offset] for i, name in enumerate(names)}, names diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b4032aec737a8..594440890bd7a 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1071,9 +1071,7 @@ def test_trailing_delimiters(all_parsers): 7,8,9,""" parser = all_parsers - with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv(StringIO(data), index_col=False) - + result = parser.read_csv(StringIO(data), index_col=False) expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) tm.assert_frame_equal(result, expected) From 56cdd189f15ed22da60bb632b983eb795e9e2d28 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 01:11:02 +0100 Subject: [PATCH 07/16] Fix dtype bug --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c139869a07a46..59485b8d4a0eb 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1856,7 +1856,7 @@ def _check_data_length(self, columns: List[str], data: List[np.ndarray]): """ if not self.index_col and len(columns) != len(data) and columns: if len(columns) == len(data) - 1 and np.all( - (data[-1] == "") | isna(data[-1]) + (np.isin(data[-1], [""])) | isna(data[-1]) ): return warnings.warn( From ac15a3086c5b1110c96a71a8aa93402607656540 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 14:58:49 +0100 Subject: [PATCH 08/16] Fix npdev bug --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 59485b8d4a0eb..8eef5ba6bc48b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1856,7 +1856,7 @@ def _check_data_length(self, columns: List[str], data: List[np.ndarray]): """ if not self.index_col and len(columns) != len(data) and columns: if len(columns) == len(data) - 1 and np.all( - (np.isin(data[-1], [""])) | isna(data[-1]) + (is_object_dtype(data[-1]) and data[-1] == "") | isna(data[-1]) ): return warnings.warn( From 387b5fa1484575c543b69d038bd0fd8af63722c6 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 15:00:45 +0100 Subject: [PATCH 09/16] Add missing init file --- pandas/tests/io/parser/common/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/io/parser/common/__init__.py diff --git a/pandas/tests/io/parser/common/__init__.py b/pandas/tests/io/parser/common/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 53cac930a63d084c3088ca46cb0dc2968f7579a1 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 15:01:25 +0100 Subject: [PATCH 10/16] Remove empty file --- pandas/tests/io/parser/test_common.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 pandas/tests/io/parser/test_common.py diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From 5d142febef76ee2cc16febcdd58693adc568388a Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 15:02:42 +0100 Subject: [PATCH 11/16] Add warning --- pandas/tests/io/parser/common/test_common_basic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 4fd754bf79ba2..fa53215cf11eb 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -11,7 +11,7 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.errors import EmptyDataError, ParserError +from pandas.errors import EmptyDataError, ParserError, ParserWarning from pandas import DataFrame, Index, Series, compat import pandas._testing as tm @@ -660,7 +660,8 @@ def test_no_header_two_extra_columns(all_parsers): ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) From b21b7955ef48085a7a284b7578a6192b0b754377 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 19 Feb 2021 21:01:09 +0100 Subject: [PATCH 12/16] Merge master --- pandas/io/parsers.py | 0 pandas/io/parsers/base_parser.py | 20 ++++++++++++++++++++ pandas/io/parsers/c_parser_wrapper.py | 2 ++ pandas/io/parsers/python_parser.py | 2 ++ 4 files changed, 24 insertions(+) delete mode 100644 pandas/io/parsers.py diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 2d17978b60327..23ac2e7e4db61 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -767,6 +767,26 @@ def _do_date_conversions(self, names, data): return names, data + def _check_data_length(self, columns: List[str], data: List[np.ndarray]): + """Checks if length of data is equal to length of column names. One set of + trailing commas is allowed. + Parameters + ---------- + columns: list of column names + data: list of array-likes containing the data column-wise + """ + if not self.index_col and len(columns) != len(data) and columns: + if len(columns) == len(data) - 1 and np.all( + (is_object_dtype(data[-1]) and data[-1] == "") | isna(data[-1]) + ): + return + warnings.warn( + "Length of header or names does not match length of data. This leads " + "to a loss of data with index_col=False.", + ParserWarning, + stacklevel=6, + ) + def _evaluate_usecols(self, usecols, names): """ Check whether or not the 'usecols' parameter diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 135e093cdc1e0..4dea9e60238c6 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -234,6 +234,8 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] + if self.usecols is None: + self._check_data_length(names, alldata) data = {k: v for k, (i, v) in zip(names, data)} diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 37f553c724c9e..dba9a8a9eccdf 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -292,6 +292,8 @@ def _exclude_implicit_index(self, alldata): if self._col_indices is not None and len(names) != len(self._col_indices): names = [names[i] for i in sorted(self._col_indices)] + self._check_data_length(names, alldata) + return {name: alldata[i + offset] for i, name in enumerate(names)}, names # legacy From eb771576132be42228ae6a6e677edc8ff111e698 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 21 Apr 2021 00:10:23 +0200 Subject: [PATCH 13/16] Fix typing --- pandas/io/parsers/base_parser.py | 3 ++- pandas/tests/io/parser/common/test_chunksize.py | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 9911233a14f65..ec729cce550e3 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -25,6 +25,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import ( + ArrayLike, DtypeArg, FilePathOrBuffer, ) @@ -778,7 +779,7 @@ def _do_date_conversions(self, names, data): return names, data - def _check_data_length(self, columns: List[str], data: List[np.ndarray]): + def _check_data_length(self, columns: List[str], data: List[ArrayLike]): """Checks if length of data is equal to length of column names. One set of trailing commas is allowed. Parameters diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 6d5aeaa713687..4bc3f3c38f506 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -143,10 +143,7 @@ def test_read_chunksize_jagged_names(all_parsers): parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) - # error: List item 0 has incompatible type "float"; expected "int" - expected = DataFrame( - [[0] + [np.nan] * 9] * 7 + [[0] * 10] # type: ignore[list-item] - ) + expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: result = concat(reader) tm.assert_frame_equal(result, expected) From 16faf35a92be6e3825c386e8ab5d6bd4df11839a Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 14 May 2021 23:56:26 +0200 Subject: [PATCH 14/16] Change test --- pandas/tests/io/parser/usecols/test_usecols_basic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index b86dc5ef85fc6..16649be5b8a58 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -383,7 +383,9 @@ def test_usecols_indices_out_of_bounds(all_parsers, names): a,b 1,2 """ - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) expected = DataFrame({"a": [1], "b": [None]}) if names is None and parser.engine == "python": From 4b3f63a763d5cb9cded5aa004c57dba910b630f4 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 15 May 2021 00:40:48 +0200 Subject: [PATCH 15/16] Remove warning --- doc/source/user_guide/io.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index a89ae5a38d8a5..7f0cd613726dc 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -757,7 +757,6 @@ the end of each data line, confusing the parser. To explicitly disable the index column inference and discard the last column, pass ``index_col=False``: .. ipython:: python - :okwarning: data = "a,b,c\n4,apple,bat,\n8,orange,cow," print(data) From fa6fed09e1d472dc8ddf9962e75f7e594b47a200 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 24 May 2021 00:43:09 +0200 Subject: [PATCH 16/16] Adress comments --- pandas/io/parsers/base_parser.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index bf955a724bb21..6031d84d2b8ec 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -793,13 +793,16 @@ def _do_date_conversions(self, names, data): return names, data - def _check_data_length(self, columns: List[str], data: List[ArrayLike]): - """Checks if length of data is equal to length of column names. One set of - trailing commas is allowed. + def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None: + """Checks if length of data is equal to length of column names. + + One set of trailing commas is allowed. self.index_col not False + results in a ParserError previously when lengths do not match. + Parameters ---------- columns: list of column names - data: list of array-likes containing the data column-wise + data: list of array-likes containing the data column-wise. """ if not self.index_col and len(columns) != len(data) and columns: if len(columns) == len(data) - 1 and np.all(