From a98512920d9bad84c34faee94ade26aab191b3dc Mon Sep 17 00:00:00 2001 From: Grant Smith Date: Wed, 16 Nov 2016 20:35:00 -0500 Subject: [PATCH] GH14671 - ERR: Raise ValueError if usecol doesn't exist with same len - Updated tests - Updated whatsnew 0.19.2 note - Added new parameter file_header for CParserWrapper to contain the original header read from the file for comparison --- doc/source/whatsnew/v0.19.2.txt | 1 + pandas/io/parsers.py | 14 ++++++++------ pandas/io/tests/parser/usecols.py | 4 ++++ pandas/parser.pyx | 8 +++++++- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 4e2c6e2faeaa5..54d0a212f6924 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -32,6 +32,7 @@ Bug Fixes +- Bug in pd.read_csv - catch missing columns if usecols and header lengths match (:issue:`14671`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 092cba093421a..7c2db9ed39b5f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1425,12 +1425,14 @@ def __init__(self, src, **kwds): self.orig_names = self.names[:] if self.usecols: - if len(self.names) > len(self.usecols): - self.names = [n for i, n in enumerate(self.names) - if (i in self.usecols or n in self.usecols)] - - if len(self.names) < len(self.usecols): - raise ValueError("Usecols do not match names.") + if self._reader.file_header is not None: + h = self._reader.file_header[0] + usecol_len = len(set(self.usecols) - set(h)) + usecoli_len = len(set(self.usecols) - set(range(0, len(h)))) + if usecol_len > 0 and usecoli_len > 0: + raise ValueError("Usecols do not match names.") + + self.names = self._filter_usecols(self.names) self._set_noconvert_columns() diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 5051171ccb8f0..0358f1fba3d8d 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -54,6 +54,10 @@ def test_usecols(self): expected.columns = ['foo', 'bar'] tm.assert_frame_equal(result, expected) + # same length but usecols column doesn't exist - see gh-14671 + self.assertRaises(ValueError, self.read_csv, StringIO(data), + usecols=['a', 'b', 'z']) + data = """\ 1,2,3 4,5,6 diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 9fb99637731be..666ab882bba3b 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -290,7 +290,7 @@ cdef class TextReader: object na_values object memory_map object as_recarray - object header, orig_header, names, header_start, header_end + object header, orig_header, names, header_start, header_end, file_header object index_col object low_memory object skiprows @@ -775,6 +775,12 @@ cdef class TextReader: data_line = hr + 1 header.append(this_header) + self.file_header = header[:] + + #if self.usecols is not None: + # if len(set(self.usecols) - set(header[0])) > 0 and len(set(self.usecols) - set(range(0,field_count))) > 0: + # raise ValueError("Usecols do not match names.") + if self.names is not None: header = [ self.names ]