From 83caa3b3852f92003f111bfa56859e28a871c10c Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 2 Mar 2016 14:59:05 +0000 Subject: [PATCH 1/2] BUG: Fix parse_dates processing with usecols and C engine Fixes bug in processing 'parse_dates' with the C engine in which the wrong indices (those of the filtered column names) were being used to determine the date columns to not be dtype-parsed by the C engine. The correct indices are those of the original (unfiltered) column names, as they are used later on in the actual data processing. Closes gh-9755. --- doc/source/whatsnew/v0.18.1.txt | 6 +++ pandas/io/parsers.py | 41 ++++++++++------- pandas/io/tests/test_parsers.py | 78 +++++++++++++++++++++++++++++++-- 3 files changed, 105 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index ecb3ff5139ad0..74522b80bf047 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -236,3 +236,9 @@ Bug Fixes - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`) - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`) - ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`) + + + + + +- Bug in ``read_csv`` when specifying ``usecols`` and ``parse_dates`` simultaneously with the C engine (:issue:`9755`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7bd8a593661c5..5223b00c17990 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1157,18 +1157,21 @@ def __init__(self, src, **kwds): else: self.names = lrange(self._reader.table_width) - # If the names were inferred (not passed by user) and usedcols is - # defined, then ensure names refers to the used columns, not the - # document's columns. 
- if self.usecols and passed_names: - col_indices = [] - for u in self.usecols: - if isinstance(u, string_types): - col_indices.append(self.names.index(u)) - else: - col_indices.append(u) - self.names = [n for i, n in enumerate(self.names) - if i in col_indices] + # gh-9755 + # + # need to set orig_names here first + # so that proper indexing can be done + # with _set_noconvert_columns + # + # once names has been filtered, we will + # then set orig_names again to names + self.orig_names = self.names[:] + + if self.usecols: + if len(self.names) > len(self.usecols): + self.names = [n for i, n in enumerate(self.names) + if (i in self.usecols or n in self.usecols)] + if len(self.names) < len(self.usecols): raise ValueError("Usecols do not match names.") @@ -1194,13 +1197,17 @@ def __init__(self, src, **kwds): self._implicit_index = self._reader.leading_cols > 0 def _set_noconvert_columns(self): - names = self.names + names = self.orig_names + usecols = self.usecols def _set(x): - if com.is_integer(x): - self._reader.set_noconvert(x) - else: - self._reader.set_noconvert(names.index(x)) + if usecols and com.is_integer(x): + x = list(usecols)[x] + + if not com.is_integer(x): + x = names.index(x) + + self._reader.set_noconvert(x) if isinstance(self.parse_dates, list): for val in self.parse_dates: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7f523cf3aa54d..0af329b75079f 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2682,12 +2682,84 @@ def test_uneven_lines_with_usecols(self): df = self.read_csv(StringIO(csv), usecols=usecols) tm.assert_frame_equal(df, expected) - usecols = ['a', 1] + usecols = ['a', 'b'] df = self.read_csv(StringIO(csv), usecols=usecols) tm.assert_frame_equal(df, expected) - usecols = ['a', 'b'] - df = self.read_csv(StringIO(csv), usecols=usecols) + def test_usecols_with_parse_dates(self): + # See gh-9755 + s = """a,b,c,d,e + 0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + 
parse_dates = [[1, 2]] + + cols = { + 'a' : [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = read_csv(StringIO(s), usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = read_csv(StringIO(s), usecols=[3, 0, 2], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_parse_dates_and_full_names(self): + # See gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + names = list('abcde') + + cols = { + 'a' : [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = read_csv(StringIO(s), names=names, + usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = read_csv(StringIO(s), names=names, + usecols=[3, 0, 2], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_parse_dates_and_usecol_names(self): + # See gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + names = list('acd') + + cols = { + 'a' : [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = read_csv(StringIO(s), names=names, + usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = read_csv(StringIO(s), names=names, + usecols=[3, 0, 2], + parse_dates=parse_dates) tm.assert_frame_equal(df, expected) From f0543a4f37850b0df288be4c7f2b6e8bb502d685 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 21 Mar 2016 20:49:10 +0000 Subject: [PATCH 2/2] BUG: Prevent mixed-typed usecols Enforces the fact that 'usecols' must either be all integers (indexing) or strings (column names), as mixtures of the two are ambiguous. Closes gh-12678. 
--- doc/source/io.rst | 8 +++- doc/source/whatsnew/v0.18.1.txt | 9 +---- pandas/io/parsers.py | 32 ++++++++++++++-- pandas/io/tests/test_parsers.py | 66 +++++++++++++++++++++++++-------- 4 files changed, 86 insertions(+), 29 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index a78222dd748ad..6b287a2eea532 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -120,8 +120,12 @@ index_col : int or sequence or ``False``, default ``None`` each line, you might consider ``index_col=False`` to force pandas to *not* use the first column as the index (row names). usecols : array-like, default ``None`` - Return a subset of the columns. Results in much faster parsing time and lower - memory usage + Return a subset of the columns. All elements in this array must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in `names` or + inferred from the document header row(s). For example, a valid `usecols` + parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter + results in much faster parsing time and lower memory usage. squeeze : boolean, default ``False`` If the parsed data only contains one column then return a Series. 
prefix : str, default ``None`` diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 74522b80bf047..f991be3dc3e10 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -101,7 +101,7 @@ API changes - ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`) - +- ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`) - ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`) - Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`) @@ -211,6 +211,7 @@ Bug Fixes - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`) - Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`) +- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`) - Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`). - Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`) @@ -236,9 +237,3 @@ Bug Fixes - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`) - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`) - ``pd.read_excel()`` now accepts path objects (e.g.
``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`) - - - - - -- Bug in ``read_csv`` when specifying ``usecols`` and ``parse_dates`` simultaneously with the C engine (:issue:`9755`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5223b00c17990..bd14862df4e8e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -75,8 +75,12 @@ class ParserWarning(Warning): of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names) usecols : array-like, default None - Return a subset of the columns. - Results in much faster parsing time and lower memory usage. + Return a subset of the columns. All elements in this array must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in `names` or + inferred from the document header row(s). For example, a valid `usecols` + parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter + results in much faster parsing time and lower memory usage. squeeze : boolean, default False If the parsed data only contains one column then return a Series prefix : str, default None @@ -801,6 +805,26 @@ def _is_index_col(col): return col is not None and col is not False +def _validate_usecols_arg(usecols): + """ + Check whether or not the 'usecols' parameter + contains all integers (column selection by index) + or strings (column by name). Raises a ValueError + if that is not the case. 
+ """ + # gh-12678 + if usecols is not None: + usecols_dtype = lib.infer_dtype(usecols) + if usecols_dtype not in ('integer', 'string'): + raise ValueError(("The elements of 'usecols' " + "must either be all strings " + "or all integers")) + + # validation has succeeded, so + # return the argument for assignment + return usecols + + class ParserBase(object): def __init__(self, kwds): @@ -1132,7 +1156,7 @@ def __init__(self, src, **kwds): self._reader = _parser.TextReader(src, **kwds) # XXX - self.usecols = self._reader.usecols + self.usecols = _validate_usecols_arg(self._reader.usecols) passed_names = self.names is None @@ -1479,7 +1503,7 @@ def __init__(self, f, **kwds): self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) - self.usecols = kwds['usecols'] + self.usecols = _validate_usecols_arg(kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] self.names_passed = kwds['names'] or None diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 0af329b75079f..2d56275279453 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2702,12 +2702,12 @@ def test_usecols_with_parse_dates(self): } expected = DataFrame(cols, columns=['c_d', 'a']) - df = read_csv(StringIO(s), usecols=[0, 2, 3], - parse_dates=parse_dates) + df = self.read_csv(StringIO(s), usecols=[0, 2, 3], + parse_dates=parse_dates) tm.assert_frame_equal(df, expected) - df = read_csv(StringIO(s), usecols=[3, 0, 2], - parse_dates=parse_dates) + df = self.read_csv(StringIO(s), usecols=[3, 0, 2], + parse_dates=parse_dates) tm.assert_frame_equal(df, expected) def test_usecols_with_parse_dates_and_full_names(self): @@ -2726,14 +2726,14 @@ def test_usecols_with_parse_dates_and_full_names(self): } expected = DataFrame(cols, columns=['c_d', 'a']) - df = read_csv(StringIO(s), names=names, - usecols=[0, 2, 3], - parse_dates=parse_dates) + df = 
self.read_csv(StringIO(s), names=names, + usecols=[0, 2, 3], + parse_dates=parse_dates) tm.assert_frame_equal(df, expected) - df = read_csv(StringIO(s), names=names, - usecols=[3, 0, 2], - parse_dates=parse_dates) + df = self.read_csv(StringIO(s), names=names, + usecols=[3, 0, 2], + parse_dates=parse_dates) tm.assert_frame_equal(df, expected) def test_usecols_with_parse_dates_and_usecol_names(self): @@ -2752,14 +2752,48 @@ def test_usecols_with_parse_dates_and_usecol_names(self): } expected = DataFrame(cols, columns=['c_d', 'a']) - df = read_csv(StringIO(s), names=names, - usecols=[0, 2, 3], - parse_dates=parse_dates) + df = self.read_csv(StringIO(s), names=names, + usecols=[0, 2, 3], + parse_dates=parse_dates) tm.assert_frame_equal(df, expected) - df = read_csv(StringIO(s), names=names, - usecols=[3, 0, 2], - parse_dates=parse_dates) + df = self.read_csv(StringIO(s), names=names, + usecols=[3, 0, 2], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + def test_mixed_dtype_usecols(self): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + msg = ("The elements of \'usecols\' " + "must either be all strings " + "or all integers") + usecols = [0, 'b', 2] + + with tm.assertRaisesRegexp(ValueError, msg): + df = self.read_csv(StringIO(data), usecols=usecols) + + def test_usecols_with_integer_like_header(self): + data = """2,0,1 + 1000,2000,3000 + 4000,5000,6000 + """ + + usecols = [0, 1] # column selection by index + expected = DataFrame(data=[[1000, 2000], + [4000, 5000]], + columns=['2', '0']) + df = self.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(df, expected) + + usecols = ['0', '1'] # column selection by name + expected = DataFrame(data=[[2000, 3000], + [5000, 6000]], + columns=['0', '1']) + df = self.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(df, expected)