Fix read_csv handling of duplicated column names combined with usecols
(issue 11823).

* doc/source/whatsnew/v0.18.1.txt: add a Bug Fixes entry.
* pandas/io/parsers.py (_handle_usecols): de-duplicate usecols with
  pandas.unique and map every string entry to all positions in usecols_key
  that carry that name.  Integer entries are passed through unchanged and
  a name that matches no column raises ValueError, as the replaced
  list.index() lookup did.
* pandas/io/tests/test_parsers.py (test_usecols): for names
  ['a', 'a', 'b'], compare usecols against no usecols and the C engine
  against the Python engine.
* pandas/parser.pyx (TextReader): store usecols as a list instead of a set.

NOTE(review): leading whitespace and the position of blank context lines
in the hunks below were reconstructed; confirm the patch applies cleanly
to the target tree.  The blob ids on the "index" lines were not recomputed
for the revised hunks.

diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
index 2545407ce43c9..e89b0d3ce2b75 100644
--- a/doc/source/whatsnew/v0.18.1.txt
+++ b/doc/source/whatsnew/v0.18.1.txt
@@ -143,3 +143,5 @@ Bug Fixes
 
 
 - Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
+
+- Bug in ``read_csv`` with duplicated columns and ``usecols`` (:issue:`11823`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index fa9a5cf12570d..238773adae867 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -3,7 +3,7 @@
 """
 from __future__ import print_function
 from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
-from pandas import compat
+from pandas import compat, unique
 from collections import defaultdict
 import re
 import csv
@@ -1788,12 +1788,18 @@ def _handle_usecols(self, columns, usecols_key):
                 if len(columns) > 1:
                     raise ValueError("If using multiple headers, usecols must "
                                      "be integers.")
                 col_indices = []
-                for u in self.usecols:
+                for u in unique(self.usecols):
                     if isinstance(u, string_types):
-                        col_indices.append(usecols_key.index(u))
+                        # a name may label several (duplicated) columns:
+                        # keep every position that carries it
+                        locs = [i for i, name in enumerate(usecols_key)
+                                if name == u]
+                        if not locs:
+                            raise ValueError("%r is not in list" % (u,))
+                        col_indices.extend(locs)
                     else:
                         col_indices.append(u)
             else:
                 col_indices = self.usecols
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 700ec3387d459..fd2f502392a30 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -2342,6 +2342,20 @@ def test_usecols(self):
         expected = expected[['a', 'b']]
         tm.assert_frame_equal(result, expected)
 
+        # 11823: usecols vs no usecols
+        result = self.read_csv(StringIO(data), names=['a', 'a', 'b'],
+                               header=None, usecols=['a', 'a', 'b'])
+        expected = self.read_csv(StringIO(data), names=['a', 'a', 'b'],
+                                 header=None)
+        tm.assert_frame_equal(result, expected)
+
+        # 11823: c vs python engine
+        result_c = pd.read_csv(StringIO(data), engine='c', header=None,
+                               names=['a', 'a', 'b'], usecols=['a', 'a', 'b'])
+        result_py = pd.read_csv(StringIO(data), engine='python', header=None,
+                                names=['a', 'a', 'b'], usecols=['a', 'a', 'b'])
+        tm.assert_frame_equal(result_c, result_py)
+
         # length conflict, passed names and usecols disagree
         self.assertRaises(ValueError, self.read_csv, StringIO(data),
                           names=['a', 'b'], usecols=[1], header=None)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index f9b8d921f02d1..0d4c400f4e582 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -280,7 +280,8 @@ cdef class TextReader:
         object compression
         object mangle_dupe_cols
         object tupleize_cols
-        set noconvert, usecols
+        set noconvert
+        list usecols
 
     def __cinit__(self, source,
                   delimiter=b',',
@@ -409,7 +410,7 @@ cdef class TextReader:
         # suboptimal
         if usecols is not None:
             self.has_usecols = 1
-            self.usecols = set(usecols)
+            self.usecols = list(usecols)
 
         # XXX
         if skip_footer > 0: