From 83caa3b3852f92003f111bfa56859e28a871c10c Mon Sep 17 00:00:00 2001
From: gfyoung <gfyoung17@gmail.com>
Date: Wed, 2 Mar 2016 14:59:05 +0000
Subject: [PATCH 1/2] BUG: Fix parse_dates processing with usecols and C engine

Fixes a bug in 'parse_dates' processing with the C engine
in which the wrong indices (those of the filtered column
names) were used to determine which date columns the C
engine should skip during dtype conversion. The correct
indices are those of the original (unfiltered) column names,
as they are used later on in the actual data processing.

Closes gh-9755.
---
 doc/source/whatsnew/v0.18.1.txt |  6 +++
 pandas/io/parsers.py            | 41 ++++++++++-------
 pandas/io/tests/test_parsers.py | 78 +++++++++++++++++++++++++++++++--
 3 files changed, 105 insertions(+), 20 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
index ecb3ff5139ad0..74522b80bf047 100644
--- a/doc/source/whatsnew/v0.18.1.txt
+++ b/doc/source/whatsnew/v0.18.1.txt
@@ -236,3 +236,9 @@ Bug Fixes
 - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
 - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
 - ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
+
+
+
+
+
+- Bug in ``read_csv`` when specifying ``usecols`` and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 7bd8a593661c5..5223b00c17990 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1157,18 +1157,21 @@ def __init__(self, src, **kwds):
             else:
                 self.names = lrange(self._reader.table_width)
 
-        # If the names were inferred (not passed by user) and usedcols is
-        # defined, then ensure names refers to the used columns, not the
-        # document's columns.
-        if self.usecols and passed_names:
-            col_indices = []
-            for u in self.usecols:
-                if isinstance(u, string_types):
-                    col_indices.append(self.names.index(u))
-                else:
-                    col_indices.append(u)
-            self.names = [n for i, n in enumerate(self.names)
-                          if i in col_indices]
+        # gh-9755
+        #
+        # need to set orig_names here first
+        # so that proper indexing can be done
+        # with _set_noconvert_columns
+        #
+        # once names has been filtered, we will
+        # then set orig_names again to names
+        self.orig_names = self.names[:]
+
+        if self.usecols:
+            if len(self.names) > len(self.usecols):
+                self.names = [n for i, n in enumerate(self.names)
+                              if (i in self.usecols or n in self.usecols)]
+
             if len(self.names) < len(self.usecols):
                 raise ValueError("Usecols do not match names.")
 
@@ -1194,13 +1197,17 @@ def __init__(self, src, **kwds):
         self._implicit_index = self._reader.leading_cols > 0
 
     def _set_noconvert_columns(self):
-        names = self.names
+        names = self.orig_names
+        usecols = self.usecols
 
         def _set(x):
-            if com.is_integer(x):
-                self._reader.set_noconvert(x)
-            else:
-                self._reader.set_noconvert(names.index(x))
+            if usecols and com.is_integer(x):
+                x = list(usecols)[x]
+
+            if not com.is_integer(x):
+                x = names.index(x)
+
+            self._reader.set_noconvert(x)
 
         if isinstance(self.parse_dates, list):
             for val in self.parse_dates:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 7f523cf3aa54d..0af329b75079f 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -2682,12 +2682,84 @@ def test_uneven_lines_with_usecols(self):
         df = self.read_csv(StringIO(csv), usecols=usecols)
         tm.assert_frame_equal(df, expected)
 
-        usecols = ['a', 1]
+        usecols = ['a', 'b']
         df = self.read_csv(StringIO(csv), usecols=usecols)
         tm.assert_frame_equal(df, expected)
 
-        usecols = ['a', 'b']
-        df = self.read_csv(StringIO(csv), usecols=usecols)
+    def test_usecols_with_parse_dates(self):
+        # See gh-9755
+        s = """a,b,c,d,e
+        0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = read_csv(StringIO(s), usecols=[0, 2, 3],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = read_csv(StringIO(s), usecols=[3, 0, 2],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_parse_dates_and_full_names(self):
+        # See gh-9755
+        s = """0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+        names = list('abcde')
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = read_csv(StringIO(s), names=names,
+                      usecols=[0, 2, 3],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = read_csv(StringIO(s), names=names,
+                      usecols=[3, 0, 2],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_parse_dates_and_usecol_names(self):
+        # See gh-9755
+        s = """0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+        names = list('acd')
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = read_csv(StringIO(s), names=names,
+                      usecols=[0, 2, 3],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = read_csv(StringIO(s), names=names,
+                      usecols=[3, 0, 2],
+                      parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
 

From f0543a4f37850b0df288be4c7f2b6e8bb502d685 Mon Sep 17 00:00:00 2001
From: gfyoung <gfyoung17@gmail.com>
Date: Mon, 21 Mar 2016 20:49:10 +0000
Subject: [PATCH 2/2] BUG: Prevent mixed-typed usecols

Enforces that 'usecols' must be either all
integers (positional indices) or all strings
(column names), as mixtures of the two are ambiguous.

Closes gh-12678.
---
 doc/source/io.rst               |  8 +++-
 doc/source/whatsnew/v0.18.1.txt |  9 +----
 pandas/io/parsers.py            | 32 ++++++++++++++--
 pandas/io/tests/test_parsers.py | 66 +++++++++++++++++++++++++--------
 4 files changed, 86 insertions(+), 29 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index a78222dd748ad..6b287a2eea532 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -120,8 +120,12 @@ index_col :  int or sequence or ``False``, default ``None``
   each line, you might consider ``index_col=False`` to force pandas to *not* use
   the first column as the index (row names).
 usecols : array-like, default ``None``
-  Return a subset of the columns. Results in much faster parsing time and lower
-  memory usage
+  Return a subset of the columns. All elements in this array must either
+  be positional (i.e. integer indices into the document columns) or strings
+  that correspond to column names provided either by the user in ``names`` or
+  inferred from the document header row(s). For example, a valid ``usecols``
+  parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Using this
+  parameter results in much faster parsing time and lower memory usage.
 squeeze : boolean, default ``False``
   If the parsed data only contains one column then return a Series.
 prefix : str, default ``None``
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
index 74522b80bf047..f991be3dc3e10 100644
--- a/doc/source/whatsnew/v0.18.1.txt
+++ b/doc/source/whatsnew/v0.18.1.txt
@@ -101,7 +101,7 @@ API changes
 
 
 - ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
-
+- ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`)
 - ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`)
 - Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`)
 
@@ -211,6 +211,7 @@ Bug Fixes
 
 - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
 - Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
+- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
 - Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`).
 - Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`)
 
@@ -236,9 +237,3 @@ Bug Fixes
 - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
 - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
 - ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
-
-
-
-
-
-- Bug in ``read_csv`` when specifying ``usecols`` and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 5223b00c17990..bd14862df4e8e 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -75,8 +75,12 @@ class ParserWarning(Warning):
     of each line, you might consider index_col=False to force pandas to _not_
     use the first column as the index (row names)
 usecols : array-like, default None
-    Return a subset of the columns.
-    Results in much faster parsing time and lower memory usage.
+    Return a subset of the columns. All elements in this array must either
+    be positional (i.e. integer indices into the document columns) or strings
+    that correspond to column names provided either by the user in `names` or
+    inferred from the document header row(s). For example, a valid `usecols`
+    parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
+    results in much faster parsing time and lower memory usage.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 prefix : str, default None
@@ -801,6 +805,26 @@ def _is_index_col(col):
     return col is not None and col is not False
 
 
+def _validate_usecols_arg(usecols):
+    """
+    Check whether or not the 'usecols' parameter
+    contains all integers (column selection by index)
+    or strings (column by name). Raises a ValueError
+    if that is not the case.
+    """
+    # gh-12678
+    if usecols is not None:
+        usecols_dtype = lib.infer_dtype(usecols)
+        if usecols_dtype not in ('integer', 'string'):
+            raise ValueError(("The elements of 'usecols' "
+                              "must either be all strings "
+                              "or all integers"))
+
+    # validation has succeeded, so
+    # return the argument for assignment
+    return usecols
+
+
 class ParserBase(object):
 
     def __init__(self, kwds):
@@ -1132,7 +1156,7 @@ def __init__(self, src, **kwds):
         self._reader = _parser.TextReader(src, **kwds)
 
         # XXX
-        self.usecols = self._reader.usecols
+        self.usecols = _validate_usecols_arg(self._reader.usecols)
 
         passed_names = self.names is None
 
@@ -1479,7 +1503,7 @@ def __init__(self, f, **kwds):
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
-        self.usecols = kwds['usecols']
+        self.usecols = _validate_usecols_arg(kwds['usecols'])
         self.skip_blank_lines = kwds['skip_blank_lines']
 
         self.names_passed = kwds['names'] or None
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 0af329b75079f..2d56275279453 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -2702,12 +2702,12 @@ def test_usecols_with_parse_dates(self):
         }
         expected = DataFrame(cols, columns=['c_d', 'a'])
 
-        df = read_csv(StringIO(s), usecols=[0, 2, 3],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
-        df = read_csv(StringIO(s), usecols=[3, 0, 2],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
     def test_usecols_with_parse_dates_and_full_names(self):
@@ -2726,14 +2726,14 @@ def test_usecols_with_parse_dates_and_full_names(self):
         }
         expected = DataFrame(cols, columns=['c_d', 'a'])
 
-        df = read_csv(StringIO(s), names=names,
-                      usecols=[0, 2, 3],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
-        df = read_csv(StringIO(s), names=names,
-                      usecols=[3, 0, 2],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
     def test_usecols_with_parse_dates_and_usecol_names(self):
@@ -2752,14 +2752,48 @@ def test_usecols_with_parse_dates_and_usecol_names(self):
         }
         expected = DataFrame(cols, columns=['c_d', 'a'])
 
-        df = read_csv(StringIO(s), names=names,
-                      usecols=[0, 2, 3],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
-        df = read_csv(StringIO(s), names=names,
-                      usecols=[3, 0, 2],
-                      parse_dates=parse_dates)
+        df = self.read_csv(StringIO(s), names=names,
+                           usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_mixed_dtype_usecols(self):
+        # See gh-12678
+        data = """a,b,c
+        1000,2000,3000
+        4000,5000,6000
+        """
+        msg = ("The elements of \'usecols\' "
+               "must either be all strings "
+               "or all integers")
+        usecols = [0, 'b', 2]
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            df = self.read_csv(StringIO(data), usecols=usecols)
+
+    def test_usecols_with_integer_like_header(self):
+        data = """2,0,1
+        1000,2000,3000
+        4000,5000,6000
+        """
+
+        usecols = [0, 1]  # column selection by index
+        expected = DataFrame(data=[[1000, 2000],
+                                   [4000, 5000]],
+                             columns=['2', '0'])
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+        usecols = ['0', '1']  # column selection by name
+        expected = DataFrame(data=[[2000, 3000],
+                                   [5000, 6000]],
+                             columns=['0', '1'])
+        df = self.read_csv(StringIO(data), usecols=usecols)
         tm.assert_frame_equal(df, expected)