Skip to content

Backport PR #38587 on branch 1.3.x (ENH: Raise ParserWarning when length of names does not match length of data) #42047

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ Other enhancements
- Improved error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`)
- Improved consistency of error messages when passing an invalid ``win_type`` argument in :ref:`Window methods <api.window>` (:issue:`15969`)
- :func:`read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
- :func:`read_csv` now raises ``ParserWarning`` if the length of the header or the given names does not match the length of the data when ``usecols`` is not specified (:issue:`21768`)
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
- Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)
Expand Down
24 changes: 24 additions & 0 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas._typing import (
ArrayLike,
DtypeArg,
FilePathOrBuffer,
final,
Expand Down Expand Up @@ -803,6 +804,29 @@ def _do_date_conversions(self, names, data):

return names, data

def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None:
    """
    Warn when the number of column names differs from the number of data columns.

    Nothing is checked when ``self.index_col`` is truthy, when the two
    lengths agree, or when ``columns`` is empty. A single surplus data
    column is tolerated as long as every value in it is either missing or
    (for object dtype) an empty string, which is what one set of trailing
    delimiters produces.

    Parameters
    ----------
    columns : list of str
        The column names.
    data : list of array-likes
        The parsed data, one array per column.

    Warns
    -----
    ParserWarning
        If the lengths differ and the mismatch is not a single
        all-empty trailing column.
    """
    # Guard clause: same conditions, evaluated in the same order, as the
    # combined ``and`` test they replace.
    if self.index_col or len(columns) == len(data) or not columns:
        return

    if len(columns) == len(data) - 1:
        trailing = data[-1]
        # Elementwise mask: empty string (object dtype only) or NA.
        blank_or_missing = (
            is_object_dtype(trailing) and trailing == ""
        ) | isna(trailing)
        if np.all(blank_or_missing):
            return

    warnings.warn(
        "Length of header or names does not match length of data. This leads "
        "to a loss of data with index_col=False.",
        ParserWarning,
        stacklevel=6,
    )

def _evaluate_usecols(self, usecols, names):
"""
Check whether or not the 'usecols' parameter
Expand Down
2 changes: 2 additions & 0 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,8 @@ def read(self, nrows=None):

# columns as list
alldata = [x[1] for x in data_tups]
if self.usecols is None:
self._check_data_length(names, alldata)

data = {k: v for k, (i, v) in zip(names, data_tups)}

Expand Down
2 changes: 2 additions & 0 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,8 @@ def _exclude_implicit_index(self, alldata):
offset = len(self.index_col) # type: ignore[has-type]

len_alldata = len(alldata)
self._check_data_length(names, alldata)

return {
name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata
}, names
Expand Down
Empty file.
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/common/test_chunksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,7 @@ def test_read_chunksize_jagged_names(all_parsers):
parser = all_parsers
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

# error: List item 0 has incompatible type "float"; expected "int"
expected = DataFrame(
[[0] + [np.nan] * 9] * 7 + [[0] * 10] # type: ignore[list-item]
)
expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
result = concat(reader)
tm.assert_frame_equal(result, expected)
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)

from pandas import (
Expand Down Expand Up @@ -685,7 +686,8 @@ def test_no_header_two_extra_columns(all_parsers):
ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
stream = StringIO("foo,bar,baz,bam,blah")
parser = all_parsers
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
with tm.assert_produces_warning(ParserWarning):
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
tm.assert_frame_equal(df, ref)


Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,9 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
a,b
1,2
"""
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
):
result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
expected = DataFrame({"a": [1], "b": [None]})
if names is None and parser.engine == "python":
Expand Down