From 960441ac44078dc05c477083662fc97b9759a78b Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 24 Sep 2016 13:03:41 -0500 Subject: [PATCH 01/23] API: add dtype= option to python parser --- pandas/io/parsers.py | 126 ++++++++++---- pandas/io/tests/parser/c_parser_only.py | 193 +-------------------- pandas/io/tests/parser/dtypes.py | 217 ++++++++++++++++++++++++ pandas/io/tests/parser/test_parsers.py | 4 +- 4 files changed, 315 insertions(+), 225 deletions(-) create mode 100644 pandas/io/tests/parser/dtypes.py diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3fe5e5e826ebd..37f6a02906a63 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -17,10 +17,14 @@ zip, string_types, map, u) from pandas.types.common import (is_integer, _ensure_object, is_list_like, is_integer_dtype, - is_float, - is_scalar) + is_float, is_dtype_equal, + is_object_dtype, + is_scalar, is_categorical_dtype) +from pandas.types.missing import isnull +from pandas.types.cast import _astype_nansafe from pandas.core.index import Index, MultiIndex, RangeIndex from pandas.core.frame import DataFrame +from pandas.core.categorical import Categorical from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser @@ -110,8 +114,9 @@ are duplicate names in the columns. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} - (Unsupported with engine='python'). Use `str` or `object` to preserve and - not interpret dtype. + Use `str` or `object` to preserve and not interpret dtype. + If converters are specified, they will be applied AFTER + dtype conversion. %s converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can either @@ -420,6 +425,7 @@ def _read(filepath_or_buffer, kwds): 'true_values': None, 'false_values': None, 'converters': None, + 'dtype': None, 'skipfooter': 0, 'keep_default_na': True, @@ -460,7 +466,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines': None, 'error_bad_lines': True, 'warn_bad_lines': True, - 'dtype': None, 'float_precision': None } @@ -475,7 +480,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines', 'error_bad_lines', 'warn_bad_lines', - 'dtype', 'float_precision', ]) _deprecated_args = set([ @@ -833,9 +837,6 @@ def _clean_options(self, options, engine): " ignored as it is not supported by the 'python'" " engine.").format(reason=fallback_reason, option=arg) - if arg == 'dtype': - msg += " (Note the 'converters' option provides"\ - " similar functionality.)" raise ValueError(msg) del result[arg] @@ -1284,18 +1285,37 @@ def _agg_index(self, index, try_parse_dates=True): col_na_values, col_na_fvalues = _get_na_values( col_name, self.na_values, self.na_fvalues) - arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues) + arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) index = MultiIndex.from_arrays(arrays, names=self.index_names) return index + def _apply_converter(self, values, conv_f, na_values, col_na_values, + col_na_fvalues): + """ apply converter function to values, respecting NAs """ + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = lib.ismember(values, na_values).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_numeric=False) + return cvals, na_count + def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, - converters=None): + converters=None, dtypes=None): result = {} for c, values in compat.iteritems(dct): conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = 
dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( @@ -1303,29 +1323,40 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, else: col_na_values, col_na_fvalues = set(), set() - coerce_type = True - if conv_f is not None: - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = lib.ismember(values, na_values).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - coerce_type = False - - cvals, na_count = self._convert_types( - values, set(col_na_values) | col_na_fvalues, coerce_type) + if conv_f is not None and cast_type is None: + # if type is not specified, apply the conversion first, without + # inference + cvals, na_count = self._apply_converter( + values, conv_f, na_values, + col_na_values, col_na_fvalues) + else: + # general type inference and conversion + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_numeric=True) if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( cvals, _parser.na_values, self.use_unsigned) + if cast_type and not is_dtype_equal(cvals, cast_type): + # type specificed in dtype param + + cvals = self._cast_types(cvals, cast_type, c) + # for consistency with c-parser, if a converter and dtype are + # specified, apply the converter last + if conv_f is not None: + values, na_count = self._apply_converter( + values, conv_f, na_values, + col_na_values, col_na_fvalues) + result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) return result - def _convert_types(self, values, na_values, try_num_bool=True): + def _infer_types(self, values, na_values, try_numeric=True): na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = lib.ismember(values, na_values) @@ -1336,9 +1367,10 @@ def _convert_types(self, values, na_values, try_num_bool=True): 
np.putmask(values, mask, np.nan) return values, na_count - if try_num_bool: + if try_numeric: try: result = lib.maybe_convert_numeric(values, na_values, False) + na_count = isnull(result).sum() except Exception: result = values if values.dtype == np.object_: @@ -1348,13 +1380,30 @@ def _convert_types(self, values, na_values, try_num_bool=True): if values.dtype == np.object_: na_count = lib.sanitize_objects(values, na_values, False) - if result.dtype == np.object_ and try_num_bool: + if result.dtype == np.object_ and try_numeric: result = lib.maybe_convert_bool(values, true_values=self.true_values, false_values=self.false_values) return result, na_count + def _cast_types(self, values, cast_type, column): + """ cast column to type specified in dtypes= param """ + if is_categorical_dtype(cast_type): + # XXX this is for consistency with + # c-parser which parses all categories + # as strings + if not is_object_dtype(values): + values = _astype_nansafe(values, str) + values = Categorical(values) + else: + try: + values = _astype_nansafe(values, cast_type, copy=True) + except ValueError: + raise ValueError("Unable to convert column %s to " + "type %s" % (column, cast_type)) + return values + def _do_date_conversions(self, names, data): # returns data, columns if self.parse_dates is not None: @@ -1783,6 +1832,7 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.dtype = kwds['dtype'] self.compact_ints = kwds['compact_ints'] self.use_unsigned = kwds['use_unsigned'] @@ -1981,7 +2031,7 @@ def read(self, rows=None): # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names) + names, self.index_col, self.index_names, self.dtype) columns = self._maybe_make_multi_index_columns( columns, self.col_names) return index, columns, col_dict @@ -2032,15 +2082,25 @@ def get_chunk(self, 
size=None): def _convert_data(self, data): # apply converters - clean_conv = {} - - for col, f in compat.iteritems(self.converters): - if isinstance(col, int) and col not in self.orig_names: - col = self.orig_names[col] - clean_conv[col] = f + def _clean_mapping(mapping): + "converts col numbers to names" + clean = {} + for col, v in compat.iteritems(mapping): + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + clean[col] = v + return clean + + clean_conv = _clean_mapping(self.converters) + if not isinstance(self.dtype, dict): + # handles single dtype applied to all columns + clean_dtypes = self.dtype + else: + clean_dtypes = _clean_mapping(self.dtype) return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, - self.verbose, clean_conv) + self.verbose, clean_conv, + clean_dtypes) def _to_recarray(self, data, columns): dtypes = [] diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 75b99654dbf89..0f23155464ad2 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -100,29 +100,13 @@ def test_dtype_and_names_error(self): self.read_csv(StringIO(data), sep=r'\s+', header=None, names=['a', 'b'], dtype={'a': np.int32}) - def test_passing_dtype(self): - # see gh-6607 + def test_unsupported_dtype(self): df = DataFrame(np.random.rand(5, 2), columns=list( 'AB'), index=['1A', '1B', '1C', '1D', '1E']) - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + with tm.ensure_clean('__unsupported_dtype__.csv') as path: df.to_csv(path) - # see gh-3795: passing 'str' as the dtype - result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) - - # we expect all object columns, so need to - # convert to test for equivalence - result = result.astype(float) - tm.assert_frame_equal(result, df) - - # invalid dtype - self.assertRaises(TypeError, self.read_csv, 
path, - dtype={'A': 'foo', 'B': 'float64'}, - index_col=0) - # valid but we don't support it (date) self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, @@ -141,11 +125,6 @@ def test_passing_dtype(self): dtype={'A': 'U8'}, index_col=0) - # see gh-12048: empty frame - actual = self.read_csv(StringIO('A,B'), dtype=str) - expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) - tm.assert_frame_equal(actual, expected) - def test_precise_conversion(self): # see gh-8002 tm._skip_if_32bit() @@ -178,104 +157,6 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_pass_dtype(self): - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'object') - - def test_categorical_dtype(self): - # GH 10153 - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['a', 'a', 'b']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'a': 'category', - 'b': 'category', - 'c': CategoricalDtype()}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) - expected = pd.DataFrame({'a': [1, 1, 2], - 'b': Categorical(['a', 'a', 'b']), - 'c': [3.4, 3.4, 4.5]}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - # unsorted - data = """a,b,c -1,b,3.4 -1,b,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', 
'2']), - 'b': Categorical(['b', 'b', 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - # missing - data = """a,b,c -1,b,3.4 -1,nan,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['b', np.nan, 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_encoding(self): - # GH 10153 - pth = tm.get_data_path('unicode_series.csv') - encoding = 'latin-1' - expected = self.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - actual = self.read_csv(pth, header=None, encoding=encoding, - dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - pth = tm.get_data_path('utf16_ex.txt') - encoding = 'utf-16' - expected = self.read_table(pth, encoding=encoding) - expected = expected.apply(Categorical) - actual = self.read_table(pth, encoding=encoding, dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_chunksize(self): - # GH 10153 - data = """a,b -1,a -1,b -1,b -2,c""" - expecteds = [pd.DataFrame({'a': [1, 1], - 'b': Categorical(['a', 'b'])}), - pd.DataFrame({'a': [1, 2], - 'b': Categorical(['b', 'c'])}, - index=[2, 3])] - actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, - chunksize=2) - - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( @@ -295,66 +176,6 @@ def test_pass_dtype_as_recarray(self): self.assertEqual(result['one'].dtype, 'u1') self.assertEqual(result['two'].dtype, 'S1') - def test_empty_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) - - expected = DataFrame({'one': np.empty(0, 
dtype='u1'), - 'two': np.empty(0, dtype=np.object)}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_index_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), index_col=['one'], - dtype={'one': 'u1', 1: 'f'}) - - expected = DataFrame({'two': np.empty(0, dtype='f')}, - index=Index([], dtype='u1', name='one')) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_multiindex_pass_dtype(self): - data = 'one,two,three' - result = self.read_csv(StringIO(data), index_col=['one', 'two'], - dtype={'one': 'u1', 1: 'f8'}) - - exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), - np.empty(0, dtype='O')], - names=['one', 'two']) - expected = DataFrame( - {'three': np.empty(0, dtype=np.object)}, index=exp_idx) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={ - 'one': 'u1', 'one.1': 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_indexes(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_dup_column_pass_dtype_by_indexes(self): - # see gh-9424 - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one.1', dtype='f')], axis=1) - - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - data = '' - result = self.read_csv(StringIO(data), names=['one', 'one'], - dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, 
expected, check_index_type=False) - def test_usecols_dtypes(self): data = """\ 1,2,3 @@ -400,16 +221,6 @@ def test_custom_lineterminator(self): tm.assert_frame_equal(result, expected) - def test_raise_on_passed_int_dtype_with_nas(self): - # see gh-2631 - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - self.assertRaises(ValueError, self.read_csv, StringIO(data), - sep=",", skipinitialspace=True, - dtype={'DOY': np.int64}) - def test_parse_ragged_csv(self): data = """1,2,3 1,2,3,4 diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py new file mode 100644 index 0000000000000..4d796f00eec91 --- /dev/null +++ b/pandas/io/tests/parser/dtypes.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- + +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" + +from datetime import datetime + +import nose + +import numpy as np +import pandas as pd +import pandas.util.testing as tm + +from pandas.lib import Timestamp +from pandas import DataFrame, Series, Index, MultiIndex, Categorical +from pandas.compat import parse_date, StringIO, lmap +from pandas.types.dtypes import CategoricalDtype + + +class DtypeTests(object): + def test_passing_dtype(self): + # see gh-6607 + df = DataFrame(np.random.rand(5, 2), columns=list( + 'AB'), index=['1A', '1B', '1C', '1D', '1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # see gh-3795: passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes, Series( + {'A': 'object', 'B': 'object'})) + + # we expect all object columns, so need to + # convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result, df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'foo', 'B': 'float64'}, + index_col=0) + + # see gh-12048: empty frame + actual = self.read_csv(StringIO('A,B'), dtype=str) + expected 
= DataFrame({'A': [], 'B': []}, index=[], dtype=str) + tm.assert_frame_equal(actual, expected) + + def test_pass_dtype(self): + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'object') + + def test_categorical_dtype(self): + # GH 10153 + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['a', 'a', 'b']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'a': 'category', + 'b': 'category', + 'c': CategoricalDtype()}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) + expected = pd.DataFrame({'a': [1, 1, 2], + 'b': Categorical(['a', 'a', 'b']), + 'c': [3.4, 3.4, 4.5]}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + # unsorted + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', 'b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + # missing + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', np.nan, 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_encoding(self): + # GH 10153 + pth = 
tm.get_data_path('unicode_series.csv') + encoding = 'latin-1' + expected = self.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + actual = self.read_csv(pth, header=None, encoding=encoding, + dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + pth = tm.get_data_path('utf16_ex.txt') + encoding = 'utf-16' + expected = self.read_table(pth, encoding=encoding) + expected = expected.apply(Categorical) + actual = self.read_table(pth, encoding=encoding, dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_chunksize(self): + # GH 10153 + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [pd.DataFrame({'a': [1, 1], + 'b': Categorical(['a', 'b'])}), + pd.DataFrame({'a': [1, 2], + 'b': Categorical(['b', 'c'])}, + index=[2, 3])] + actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + def test_empty_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) + + expected = DataFrame({'one': np.empty(0, dtype='u1'), + 'two': np.empty(0, dtype=np.object)}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_index_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), index_col=['one'], + dtype={'one': 'u1', 1: 'f'}) + + expected = DataFrame({'two': np.empty(0, dtype='f')}, + index=Index([], dtype='u1', name='one')) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_multiindex_pass_dtype(self): + data = 'one,two,three' + result = self.read_csv(StringIO(data), index_col=['one', 'two'], + dtype={'one': 'u1', 1: 'f8'}) + + exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), + np.empty(0, dtype='O')], + names=['one', 'two']) + expected = DataFrame( + {'three': np.empty(0, dtype=np.object)}, index=exp_idx) + 
tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_names(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 'one.1': 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_indexes(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_dup_column_pass_dtype_by_indexes(self): + # see gh-9424 + expected = pd.concat([Series([], name='one', dtype='u1'), + Series([], name='one.1', dtype='f')], axis=1) + + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_raise_on_passed_int_dtype_with_nas(self): + # see gh-2631 + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + self.assertRaises(ValueError, self.read_csv, StringIO(data), + sep=",", skipinitialspace=True, + dtype={'DOY': np.int64}) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 6001c85ae76b1..6cca2e35e1135 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -22,6 +22,7 @@ from .compression import CompressionTests from .multithread import MultithreadTests from .python_parser_only import PythonParserTests +from .dtypes import DtypeTests class BaseParser(CommentTests, CompressionTests, @@ -29,7 +30,8 @@ class BaseParser(CommentTests, 
CompressionTests, IndexColTests, MultithreadTests, NAvaluesTests, ParseDatesTests, ParserTests, SkipRowsTests, - UsecolsTests, QuotingTests): + UsecolsTests, QuotingTests, + DtypeTests): def read_csv(self, *args, **kwargs): raise NotImplementedError From 7be7b423c1e673304e4e35a3bd4889cbc9ffc3af Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 24 Sep 2016 14:09:32 -0500 Subject: [PATCH 02/23] remove unsupported test --- pandas/io/tests/parser/test_unsupported.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 5d60c20854a83..ffd1cfa9a2538 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -44,16 +44,6 @@ def test_c_engine(self): data = 'a b c\n1 2 3' msg = 'does not support' - # specify C-unsupported options with python-unsupported option - # (options will be ignored on fallback, raise) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep=None, - delim_whitespace=False, dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep=r'\s', dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), skipfooter=1, dtype={'a': float}) - # specify C engine with unsupported options (raise) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', From 65a94ae85772ae5e32011739790a1551924bb4b1 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 25 Sep 2016 09:10:13 -0500 Subject: [PATCH 03/23] add test/fix for dtype=object --- pandas/io/parsers.py | 15 ++++++++++----- pandas/io/tests/parser/dtypes.py | 15 +++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 37f6a02906a63..ee20965ec50fb 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1303,7 +1303,7 @@ def _apply_converter(self, values, conv_f, na_values, 
col_na_values, cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, - try_numeric=False) + try_num_bool=False) return cvals, na_count def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, @@ -1330,10 +1330,15 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, values, conv_f, na_values, col_na_values, col_na_fvalues) else: + try_num_bool = True + if cast_type and is_object_dtype(cast_type): + # skip inference if specified dtype is object + try_num_bool = False + # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, - try_numeric=True) + try_num_bool) if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( @@ -1356,7 +1361,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, print('Filled %d NA values in column %s' % (na_count, str(c))) return result - def _infer_types(self, values, na_values, try_numeric=True): + def _infer_types(self, values, na_values, try_num_bool=True): na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = lib.ismember(values, na_values) @@ -1367,7 +1372,7 @@ def _infer_types(self, values, na_values, try_numeric=True): np.putmask(values, mask, np.nan) return values, na_count - if try_numeric: + if try_num_bool: try: result = lib.maybe_convert_numeric(values, na_values, False) na_count = isnull(result).sum() @@ -1380,7 +1385,7 @@ def _infer_types(self, values, na_values, try_numeric=True): if values.dtype == np.object_: na_count = lib.sanitize_objects(values, na_values, False) - if result.dtype == np.object_ and try_numeric: + if result.dtype == np.object_ and try_num_bool: result = lib.maybe_convert_bool(values, true_values=self.true_values, false_values=self.false_values) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index 4d796f00eec91..a0a3b43279475 100644 --- 
a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -5,17 +5,12 @@ for all of the parsers defined in parsers.py """ -from datetime import datetime - -import nose - import numpy as np import pandas as pd import pandas.util.testing as tm -from pandas.lib import Timestamp from pandas import DataFrame, Series, Index, MultiIndex, Categorical -from pandas.compat import parse_date, StringIO, lmap +from pandas.compat import StringIO from pandas.types.dtypes import CategoricalDtype @@ -30,8 +25,12 @@ def test_passing_dtype(self): # see gh-3795: passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) + expected = df.astype(str) + tm.assert_frame_equal(result, expected) + + # for parsing, interpret object as str + result = self.read_csv(path, dtype=object, index_col=0) + tm.assert_frame_equal(result, expected) # we expect all object columns, so need to # convert to test for equivalence From 68535879314b7d90dbd47a46e28d06cd732da58c Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 25 Sep 2016 10:24:55 -0500 Subject: [PATCH 04/23] float precision... 
--- pandas/io/tests/parser/c_parser_only.py | 3 +-- pandas/io/tests/parser/dtypes.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 0f23155464ad2..c781b0549ee60 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -12,10 +12,9 @@ import pandas as pd import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex, Categorical +from pandas import DataFrame from pandas import compat from pandas.compat import StringIO, range, lrange -from pandas.types.dtypes import CategoricalDtype class CParserTests(object): diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index a0a3b43279475..cf37dd97b9fc9 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -21,11 +21,11 @@ def test_passing_dtype(self): 'AB'), index=['1A', '1B', '1C', '1D', '1E']) with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path) + df.to_csv(path, float_format='%.12f') # see gh-3795: passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - expected = df.astype(str) + expected = df.applymap(lambda x: '%.12f' % (x,)) tm.assert_frame_equal(result, expected) # for parsing, interpret object as str From 3024177264f2066c6cf8d9a5e5ade1695d53f7dc Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 26 Sep 2016 19:29:15 -0500 Subject: [PATCH 05/23] float precision fix --- pandas/io/tests/parser/dtypes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index cf37dd97b9fc9..cba293965e56b 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -17,15 +17,15 @@ class DtypeTests(object): def test_passing_dtype(self): # see gh-6607 - df = DataFrame(np.random.rand(5, 2), columns=list( + df = 
DataFrame(np.random.rand(5, 2).round(4), columns=list( 'AB'), index=['1A', '1B', '1C', '1D', '1E']) with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path, float_format='%.12f') + df.to_csv(path) # see gh-3795: passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - expected = df.applymap(lambda x: '%.12f' % (x,)) + expected = df.astype(str) tm.assert_frame_equal(result, expected) # for parsing, interpret object as str From f9ff10edb9e21dd88b52802b77f92ef9eccfd4a3 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 30 Oct 2016 16:33:30 -0500 Subject: [PATCH 06/23] add docs; test for conv cast --- doc/source/io.rst | 9 ++-- doc/source/whatsnew/v0.20.0.txt | 9 ++++ pandas/io/parsers.py | 93 +++++++++++++++++++------------- pandas/io/tests/parser/dtypes.py | 10 ++++ 4 files changed, 79 insertions(+), 42 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ee319092c6dd5..03210ce3231b9 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None`` Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}`` (unsupported with ``engine='python'``). Use `str` or `object` to preserve and not interpret dtype. + + .. versionadded:: 0.20.0 support for the Python parser. + engine : {``'c'``, ``'python'``} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. @@ -473,10 +476,8 @@ However, if you wanted for all the data to be coerced, no matter the type, then using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be worth trying. -.. note:: - The ``dtype`` option is currently only supported by the C engine. - Specifying ``dtype`` with ``engine`` other than 'c' raises a - ``ValueError``. + .. versionadded:: 0.20.0 support for the Python parser. + The ``dtype`` option is supported by the 'python' engine .. 
note:: In some cases, reading in abnormal data with columns containing mixed dtypes diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 581106924c77e..62000139234c8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -32,6 +32,15 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) +- The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns + is now supported with the ``'python'`` engine. See the :ref:`io docs ` for more information. + +.. ipython:: python + + from io import StringIO + data = "a,b\n1,2\n3,4" + pd.read_csv(StringIO(data), engine='python').dtypes + pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes .. _whatsnew_0200.api_breaking: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ee20965ec50fb..b3142b3ef740e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -115,8 +115,11 @@ dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} Use `str` or `object` to preserve and not interpret dtype. - If converters are specified, they will be applied AFTER - dtype conversion. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + + .. versionadded:: 0.20.0 support for the Python parser. + %s converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can either @@ -1292,20 +1295,6 @@ def _agg_index(self, index, try_parse_dates=True): return index - def _apply_converter(self, values, conv_f, na_values, col_na_values, - col_na_fvalues): - """ apply converter function to values, respecting NAs """ - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = lib.ismember(values, na_values).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, - try_num_bool=False) - return cvals, na_count - def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None): result = {} @@ -1323,45 +1312,58 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, else: col_na_values, col_na_fvalues = set(), set() - if conv_f is not None and cast_type is None: - # if type is not specified, apply the conversion first, without - # inference - cvals, na_count = self._apply_converter( - values, conv_f, na_values, - col_na_values, col_na_fvalues) + if conv_f is not None: + # conv_f applied to data before inference + # dtype isn't used if a converted specified + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = lib.ismember(values, na_values).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_num_bool=False) else: - try_num_bool = True - if cast_type and is_object_dtype(cast_type): - # skip inference if specified dtype is object - try_num_bool = False + # skip inference if specified dtype is object + try_num_bool = not (cast_type and is_object_dtype(cast_type)) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) + # type specificed in dtype param + if cast_type and not is_dtype_equal(cvals, cast_type): + cvals = self._cast_types(cvals, cast_type, 
c) + if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( cvals, _parser.na_values, self.use_unsigned) - if cast_type and not is_dtype_equal(cvals, cast_type): - # type specificed in dtype param - - cvals = self._cast_types(cvals, cast_type, c) - # for consistency with c-parser, if a converter and dtype are - # specified, apply the converter last - if conv_f is not None: - values, na_count = self._apply_converter( - values, conv_f, na_values, - col_na_values, col_na_fvalues) - result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) return result def _infer_types(self, values, na_values, try_num_bool=True): + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + try_num_bool : bool, default try + try to cast values to numeric (first preference) or boolean + + Returns: + -------- + converted : ndarray + na_count : int + """ + na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = lib.ismember(values, na_values) @@ -1393,7 +1395,22 @@ def _infer_types(self, values, na_values, try_num_bool=True): return result, na_count def _cast_types(self, values, cast_type, column): - """ cast column to type specified in dtypes= param """ + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray + cast_type : string or np.dtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray + """ + if is_categorical_dtype(cast_type): # XXX this is for consistency with # c-parser which parses all categories diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index cba293965e56b..510efac80ee78 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -214,3 +214,13 @@ def test_raise_on_passed_int_dtype_with_nas(self): self.assertRaises(ValueError, self.read_csv, 
StringIO(data), sep=",", skipinitialspace=True, dtype={'DOY': np.int64}) + + def test_dtype_with_converter(self): + data = """a,b +1.1,2.2 +1.2,2.3""" + result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, + converters={'a': lambda x: str(x)}) + # dtype spec ignored if converted specified + expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) From f5b23a67b5490e181326eb533cb728a9a5832d71 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 5 Nov 2016 12:22:48 -0500 Subject: [PATCH 07/23] Add warning if both converter and dtype specified --- pandas/io/parsers.py | 7 +++- pandas/io/tests/parser/dtypes.py | 6 ++-- pandas/parser.pyx | 58 +++++++++++++++++--------------- 3 files changed, 41 insertions(+), 30 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b3142b3ef740e..8187b129d2702 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1314,7 +1314,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, if conv_f is not None: # conv_f applied to data before inference - # dtype isn't used if a converted specified + if cast_type is not None: + warnings.warn(("Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used").format(c), ParserWarning, + stacklevel=7) + try: values = lib.map_infer(values, conv_f) except ValueError: diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index 510efac80ee78..a2163aaf31ea8 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -12,6 +12,7 @@ from pandas import DataFrame, Series, Index, MultiIndex, Categorical from pandas.compat import StringIO from pandas.types.dtypes import CategoricalDtype +from pandas.io.common import ParserWarning class DtypeTests(object): @@ -219,8 +220,9 @@ def test_dtype_with_converter(self): data = """a,b 1.1,2.2 1.2,2.3""" - result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, - 
converters={'a': lambda x: str(x)}) # dtype spec ignored if converted specified + with tm.assert_produces_warning(ParserWarning): + result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, + converters={'a': lambda x: str(x)}) expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 6b43dfbabc4a0..ca9b34c06f025 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -13,7 +13,7 @@ from cpython cimport (PyObject, PyBytes_FromString, PyUnicode_Check, PyUnicode_AsUTF8String, PyErr_Occurred, PyErr_Fetch) from cpython.ref cimport PyObject, Py_XDECREF -from io.common import ParserError, DtypeWarning, EmptyDataError +from io.common import ParserError, DtypeWarning, EmptyDataError, ParserWarning # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. @@ -987,7 +987,7 @@ cdef class TextReader: Py_ssize_t i, nused kh_str_t *na_hashset = NULL int start, end - object name, na_flist + object name, na_flist, col_dtype = None bint na_filter = 0 Py_ssize_t num_cols @@ -1043,14 +1043,33 @@ cdef class TextReader: else: na_filter = 0 + col_dtype = None + if self.dtype is not None: + if isinstance(self.dtype, dict): + if name in self.dtype: + col_dtype = self.dtype[name] + elif i in self.dtype: + col_dtype = self.dtype[i] + else: + if self.dtype.names: + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) + else: + col_dtype = self.dtype + if conv: + if col_dtype is not None: + warnings.warn(("Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used").format(name), ParserWarning, + stacklevel=5) results[i] = _apply_converter(conv, self.parser, i, start, end, self.c_encoding) continue # Should return as the desired dtype (inferred or specified) col_res, na_count = self._convert_tokens( - i, start, end, name, na_filter, na_hashset, 
na_flist) + i, start, end, name, na_filter, na_hashset, na_flist, col_dtype) if na_filter: self._free_na_set(na_hashset) @@ -1075,32 +1094,17 @@ cdef class TextReader: cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, object name, bint na_filter, kh_str_t *na_hashset, - object na_flist): - cdef: - object col_dtype = None - - if self.dtype is not None: - if isinstance(self.dtype, dict): - if name in self.dtype: - col_dtype = self.dtype[name] - elif i in self.dtype: - col_dtype = self.dtype[i] - else: - if self.dtype.names: - # structured array - col_dtype = np.dtype(self.dtype.descr[i][1]) - else: - col_dtype = self.dtype + object na_flist, object col_dtype): - if col_dtype is not None: - col_res, na_count = self._convert_with_dtype( - col_dtype, i, start, end, na_filter, - 1, na_hashset, na_flist) + if col_dtype is not None: + col_res, na_count = self._convert_with_dtype( + col_dtype, i, start, end, na_filter, + 1, na_hashset, na_flist) - # Fallback on the parse (e.g. we requested int dtype, - # but its actually a float). - if col_res is not None: - return col_res, na_count + # Fallback on the parse (e.g. we requested int dtype, + # but its actually a float). + if col_res is not None: + return col_res, na_count if i in self.noconvert: return self._string_convert(i, start, end, na_filter, na_hashset) From e0e5ae817a72e1fc8179dbeefd05f1881588a53b Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 13 Nov 2016 09:18:15 -0600 Subject: [PATCH 08/23] doc comments --- doc/source/whatsnew/v0.20.0.txt | 1 - pandas/io/parsers.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 62000139234c8..bef90b5392418 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -37,7 +37,6 @@ Other enhancements .. 
ipython:: python - from io import StringIO data = "a,b\n1,2\n3,4" pd.read_csv(StringIO(data), engine='python').dtypes pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8187b129d2702..31b3ea1ebf3c0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -117,9 +117,6 @@ Use `str` or `object` to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. - - .. versionadded:: 0.20.0 support for the Python parser. - %s converters : dict, default None Dict of functions for converting values in certain columns. Keys can either From a5821d3dfba3232395ee4b48267236691f202d6c Mon Sep 17 00:00:00 2001 From: Christopher Bartak Date: Wed, 23 Nov 2016 08:39:10 -0600 Subject: [PATCH 09/23] doc updates --- doc/source/io.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 03210ce3231b9..b1c151def26af 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -477,6 +477,7 @@ using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be worth trying. .. versionadded:: 0.20.0 support for the Python parser. + The ``dtype`` option is supported by the 'python' engine .. note:: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index bef90b5392418..30c80ca0a7523 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -22,8 +22,17 @@ New features ~~~~~~~~~~~~ +``read_csv`` supports ``dtype`` keyword for python engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns + is now supported with the ``'python'`` engine. See the :ref:`io docs ` for more information. +.. 
ipython:: python + + data = "a,b\n1,2\n3,4" + pd.read_csv(StringIO(data), engine='python').dtypes + pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes .. _whatsnew_0200.enhancements.other: @@ -32,14 +41,6 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) -- The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns - is now supported with the ``'python'`` engine. See the :ref:`io docs ` for more information. - -.. ipython:: python - - data = "a,b\n1,2\n3,4" - pd.read_csv(StringIO(data), engine='python').dtypes - pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes .. _whatsnew_0200.api_breaking: From 7c703fe632c0a9a98ef648509d3c84ff0bc1a292 Mon Sep 17 00:00:00 2001 From: Christopher Bartak Date: Wed, 23 Nov 2016 09:34:53 -0600 Subject: [PATCH 10/23] lint --- pandas/io/tests/parser/dtypes.py | 2 +- pandas/parser.pyx | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index a2163aaf31ea8..058bfea7ae330 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -223,6 +223,6 @@ def test_dtype_with_converter(self): # dtype spec ignored if converted specified with tm.assert_produces_warning(ParserWarning): result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, - converters={'a': lambda x: str(x)}) + converters={'a': lambda x: str(x)}) expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index ca9b34c06f025..6760e822960f1 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1069,7 +1069,8 @@ cdef class TextReader: # Should return as the desired dtype (inferred or specified) col_res, na_count = self._convert_tokens( - i, start, end, name, na_filter, na_hashset, na_flist, 
col_dtype) + i, start, end, name, na_filter, na_hashset, + na_flist, col_dtype) if na_filter: self._free_na_set(na_hashset) From d790bdf1779a2f248aa290234d23abb1fec03043 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 24 Sep 2016 13:03:41 -0500 Subject: [PATCH 11/23] API: add dtype= option to python parser --- pandas/io/parsers.py | 126 ++++++++++---- pandas/io/tests/parser/c_parser_only.py | 193 +-------------------- pandas/io/tests/parser/dtypes.py | 217 ++++++++++++++++++++++++ pandas/io/tests/parser/test_parsers.py | 4 +- 4 files changed, 315 insertions(+), 225 deletions(-) create mode 100644 pandas/io/tests/parser/dtypes.py diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 929b360854d5b..d4e1a70240bb2 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -17,11 +17,15 @@ zip, string_types, map, u) from pandas.types.common import (is_integer, _ensure_object, is_list_like, is_integer_dtype, - is_float, - is_scalar) + is_float, is_dtype_equal, + is_object_dtype, + is_scalar, is_categorical_dtype) +from pandas.types.missing import isnull +from pandas.types.cast import _astype_nansafe from pandas.core.index import Index, MultiIndex, RangeIndex from pandas.core.series import Series from pandas.core.frame import DataFrame +from pandas.core.categorical import Categorical from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser @@ -111,8 +115,9 @@ are duplicate names in the columns. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} - (Unsupported with engine='python'). Use `str` or `object` to preserve and - not interpret dtype. + Use `str` or `object` to preserve and not interpret dtype. + If converters are specified, they will be applied AFTER + dtype conversion. %s converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can either @@ -421,6 +426,7 @@ def _read(filepath_or_buffer, kwds): 'true_values': None, 'false_values': None, 'converters': None, + 'dtype': None, 'skipfooter': 0, 'keep_default_na': True, @@ -461,7 +467,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines': None, 'error_bad_lines': True, 'warn_bad_lines': True, - 'dtype': None, 'float_precision': None } @@ -476,7 +481,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines', 'error_bad_lines', 'warn_bad_lines', - 'dtype', 'float_precision', ]) _deprecated_args = set([ @@ -834,9 +838,6 @@ def _clean_options(self, options, engine): " ignored as it is not supported by the 'python'" " engine.").format(reason=fallback_reason, option=arg) - if arg == 'dtype': - msg += " (Note the 'converters' option provides"\ - " similar functionality.)" raise ValueError(msg) del result[arg] @@ -1285,18 +1286,37 @@ def _agg_index(self, index, try_parse_dates=True): col_na_values, col_na_fvalues = _get_na_values( col_name, self.na_values, self.na_fvalues) - arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues) + arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) index = MultiIndex.from_arrays(arrays, names=self.index_names) return index + def _apply_converter(self, values, conv_f, na_values, col_na_values, + col_na_fvalues): + """ apply converter function to values, respecting NAs """ + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = lib.ismember(values, na_values).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_numeric=False) + return cvals, na_count + def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, - converters=None): + converters=None, dtypes=None): result = {} for c, values in compat.iteritems(dct): conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = 
dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( @@ -1304,29 +1324,40 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, else: col_na_values, col_na_fvalues = set(), set() - coerce_type = True - if conv_f is not None: - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = lib.ismember(values, na_values).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - coerce_type = False - - cvals, na_count = self._convert_types( - values, set(col_na_values) | col_na_fvalues, coerce_type) + if conv_f is not None and cast_type is None: + # if type is not specified, apply the conversion first, without + # inference + cvals, na_count = self._apply_converter( + values, conv_f, na_values, + col_na_values, col_na_fvalues) + else: + # general type inference and conversion + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_numeric=True) if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( cvals, _parser.na_values, self.use_unsigned) + if cast_type and not is_dtype_equal(cvals, cast_type): + # type specificed in dtype param + + cvals = self._cast_types(cvals, cast_type, c) + # for consistency with c-parser, if a converter and dtype are + # specified, apply the converter last + if conv_f is not None: + values, na_count = self._apply_converter( + values, conv_f, na_values, + col_na_values, col_na_fvalues) + result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) return result - def _convert_types(self, values, na_values, try_num_bool=True): + def _infer_types(self, values, na_values, try_numeric=True): na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = lib.ismember(values, na_values) @@ -1337,9 +1368,10 @@ def _convert_types(self, values, na_values, try_num_bool=True): 
np.putmask(values, mask, np.nan) return values, na_count - if try_num_bool: + if try_numeric: try: result = lib.maybe_convert_numeric(values, na_values, False) + na_count = isnull(result).sum() except Exception: result = values if values.dtype == np.object_: @@ -1349,13 +1381,30 @@ def _convert_types(self, values, na_values, try_num_bool=True): if values.dtype == np.object_: na_count = lib.sanitize_objects(values, na_values, False) - if result.dtype == np.object_ and try_num_bool: + if result.dtype == np.object_ and try_numeric: result = lib.maybe_convert_bool(values, true_values=self.true_values, false_values=self.false_values) return result, na_count + def _cast_types(self, values, cast_type, column): + """ cast column to type specified in dtypes= param """ + if is_categorical_dtype(cast_type): + # XXX this is for consistency with + # c-parser which parses all categories + # as strings + if not is_object_dtype(values): + values = _astype_nansafe(values, str) + values = Categorical(values) + else: + try: + values = _astype_nansafe(values, cast_type, copy=True) + except ValueError: + raise ValueError("Unable to convert column %s to " + "type %s" % (column, cast_type)) + return values + def _do_date_conversions(self, names, data): # returns data, columns if self.parse_dates is not None: @@ -1784,6 +1833,7 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.dtype = kwds['dtype'] self.compact_ints = kwds['compact_ints'] self.use_unsigned = kwds['use_unsigned'] @@ -1982,7 +2032,7 @@ def read(self, rows=None): # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names) + names, self.index_col, self.index_names, self.dtype) columns = self._maybe_make_multi_index_columns( columns, self.col_names) return index, columns, col_dict @@ -2033,15 +2083,25 @@ def get_chunk(self, 
size=None): def _convert_data(self, data): # apply converters - clean_conv = {} - - for col, f in compat.iteritems(self.converters): - if isinstance(col, int) and col not in self.orig_names: - col = self.orig_names[col] - clean_conv[col] = f + def _clean_mapping(mapping): + "converts col numbers to names" + clean = {} + for col, v in compat.iteritems(mapping): + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + clean[col] = v + return clean + + clean_conv = _clean_mapping(self.converters) + if not isinstance(self.dtype, dict): + # handles single dtype applied to all columns + clean_dtypes = self.dtype + else: + clean_dtypes = _clean_mapping(self.dtype) return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, - self.verbose, clean_conv) + self.verbose, clean_conv, + clean_dtypes) def _to_recarray(self, data, columns): dtypes = [] diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 9cbe88d4032a3..d8926855ddca7 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -100,29 +100,13 @@ def test_dtype_and_names_error(self): self.read_csv(StringIO(data), sep=r'\s+', header=None, names=['a', 'b'], dtype={'a': np.int32}) - def test_passing_dtype(self): - # see gh-6607 + def test_unsupported_dtype(self): df = DataFrame(np.random.rand(5, 2), columns=list( 'AB'), index=['1A', '1B', '1C', '1D', '1E']) - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + with tm.ensure_clean('__unsupported_dtype__.csv') as path: df.to_csv(path) - # see gh-3795: passing 'str' as the dtype - result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) - - # we expect all object columns, so need to - # convert to test for equivalence - result = result.astype(float) - tm.assert_frame_equal(result, df) - - # invalid dtype - self.assertRaises(TypeError, self.read_csv, 
path, - dtype={'A': 'foo', 'B': 'float64'}, - index_col=0) - # valid but we don't support it (date) self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, @@ -141,11 +125,6 @@ def test_passing_dtype(self): dtype={'A': 'U8'}, index_col=0) - # see gh-12048: empty frame - actual = self.read_csv(StringIO('A,B'), dtype=str) - expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) - tm.assert_frame_equal(actual, expected) - def test_precise_conversion(self): # see gh-8002 tm._skip_if_32bit() @@ -178,104 +157,6 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_pass_dtype(self): - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'object') - - def test_categorical_dtype(self): - # GH 10153 - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['a', 'a', 'b']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'a': 'category', - 'b': 'category', - 'c': CategoricalDtype()}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) - expected = pd.DataFrame({'a': [1, 1, 2], - 'b': Categorical(['a', 'a', 'b']), - 'c': [3.4, 3.4, 4.5]}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - # unsorted - data = """a,b,c -1,b,3.4 -1,b,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', 
'2']), - 'b': Categorical(['b', 'b', 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - # missing - data = """a,b,c -1,b,3.4 -1,nan,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['b', np.nan, 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_encoding(self): - # GH 10153 - pth = tm.get_data_path('unicode_series.csv') - encoding = 'latin-1' - expected = self.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - actual = self.read_csv(pth, header=None, encoding=encoding, - dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - pth = tm.get_data_path('utf16_ex.txt') - encoding = 'utf-16' - expected = self.read_table(pth, encoding=encoding) - expected = expected.apply(Categorical) - actual = self.read_table(pth, encoding=encoding, dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_chunksize(self): - # GH 10153 - data = """a,b -1,a -1,b -1,b -2,c""" - expecteds = [pd.DataFrame({'a': [1, 1], - 'b': Categorical(['a', 'b'])}), - pd.DataFrame({'a': [1, 2], - 'b': Categorical(['b', 'c'])}, - index=[2, 3])] - actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, - chunksize=2) - - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( @@ -295,66 +176,6 @@ def test_pass_dtype_as_recarray(self): self.assertEqual(result['one'].dtype, 'u1') self.assertEqual(result['two'].dtype, 'S1') - def test_empty_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) - - expected = DataFrame({'one': np.empty(0, 
dtype='u1'), - 'two': np.empty(0, dtype=np.object)}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_index_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), index_col=['one'], - dtype={'one': 'u1', 1: 'f'}) - - expected = DataFrame({'two': np.empty(0, dtype='f')}, - index=Index([], dtype='u1', name='one')) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_multiindex_pass_dtype(self): - data = 'one,two,three' - result = self.read_csv(StringIO(data), index_col=['one', 'two'], - dtype={'one': 'u1', 1: 'f8'}) - - exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), - np.empty(0, dtype='O')], - names=['one', 'two']) - expected = DataFrame( - {'three': np.empty(0, dtype=np.object)}, index=exp_idx) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={ - 'one': 'u1', 'one.1': 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_indexes(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_dup_column_pass_dtype_by_indexes(self): - # see gh-9424 - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one.1', dtype='f')], axis=1) - - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - data = '' - result = self.read_csv(StringIO(data), names=['one', 'one'], - dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, 
expected, check_index_type=False) - def test_usecols_dtypes(self): data = """\ 1,2,3 @@ -400,16 +221,6 @@ def test_custom_lineterminator(self): tm.assert_frame_equal(result, expected) - def test_raise_on_passed_int_dtype_with_nas(self): - # see gh-2631 - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - self.assertRaises(ValueError, self.read_csv, StringIO(data), - sep=",", skipinitialspace=True, - dtype={'DOY': np.int64}) - def test_parse_ragged_csv(self): data = """1,2,3 1,2,3,4 diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py new file mode 100644 index 0000000000000..4d796f00eec91 --- /dev/null +++ b/pandas/io/tests/parser/dtypes.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- + +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" + +from datetime import datetime + +import nose + +import numpy as np +import pandas as pd +import pandas.util.testing as tm + +from pandas.lib import Timestamp +from pandas import DataFrame, Series, Index, MultiIndex, Categorical +from pandas.compat import parse_date, StringIO, lmap +from pandas.types.dtypes import CategoricalDtype + + +class DtypeTests(object): + def test_passing_dtype(self): + # see gh-6607 + df = DataFrame(np.random.rand(5, 2), columns=list( + 'AB'), index=['1A', '1B', '1C', '1D', '1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # see gh-3795: passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes, Series( + {'A': 'object', 'B': 'object'})) + + # we expect all object columns, so need to + # convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result, df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'foo', 'B': 'float64'}, + index_col=0) + + # see gh-12048: empty frame + actual = self.read_csv(StringIO('A,B'), dtype=str) + expected 
= DataFrame({'A': [], 'B': []}, index=[], dtype=str) + tm.assert_frame_equal(actual, expected) + + def test_pass_dtype(self): + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'object') + + def test_categorical_dtype(self): + # GH 10153 + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['a', 'a', 'b']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'a': 'category', + 'b': 'category', + 'c': CategoricalDtype()}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) + expected = pd.DataFrame({'a': [1, 1, 2], + 'b': Categorical(['a', 'a', 'b']), + 'c': [3.4, 3.4, 4.5]}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + # unsorted + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', 'b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + # missing + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', np.nan, 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_encoding(self): + # GH 10153 + pth = 
tm.get_data_path('unicode_series.csv') + encoding = 'latin-1' + expected = self.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + actual = self.read_csv(pth, header=None, encoding=encoding, + dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + pth = tm.get_data_path('utf16_ex.txt') + encoding = 'utf-16' + expected = self.read_table(pth, encoding=encoding) + expected = expected.apply(Categorical) + actual = self.read_table(pth, encoding=encoding, dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_chunksize(self): + # GH 10153 + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [pd.DataFrame({'a': [1, 1], + 'b': Categorical(['a', 'b'])}), + pd.DataFrame({'a': [1, 2], + 'b': Categorical(['b', 'c'])}, + index=[2, 3])] + actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + def test_empty_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) + + expected = DataFrame({'one': np.empty(0, dtype='u1'), + 'two': np.empty(0, dtype=np.object)}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_index_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), index_col=['one'], + dtype={'one': 'u1', 1: 'f'}) + + expected = DataFrame({'two': np.empty(0, dtype='f')}, + index=Index([], dtype='u1', name='one')) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_multiindex_pass_dtype(self): + data = 'one,two,three' + result = self.read_csv(StringIO(data), index_col=['one', 'two'], + dtype={'one': 'u1', 1: 'f8'}) + + exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), + np.empty(0, dtype='O')], + names=['one', 'two']) + expected = DataFrame( + {'three': np.empty(0, dtype=np.object)}, index=exp_idx) + 
tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_names(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 'one.1': 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_indexes(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_dup_column_pass_dtype_by_indexes(self): + # see gh-9424 + expected = pd.concat([Series([], name='one', dtype='u1'), + Series([], name='one.1', dtype='f')], axis=1) + + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_raise_on_passed_int_dtype_with_nas(self): + # see gh-2631 + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + self.assertRaises(ValueError, self.read_csv, StringIO(data), + sep=",", skipinitialspace=True, + dtype={'DOY': np.int64}) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 6001c85ae76b1..6cca2e35e1135 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -22,6 +22,7 @@ from .compression import CompressionTests from .multithread import MultithreadTests from .python_parser_only import PythonParserTests +from .dtypes import DtypeTests class BaseParser(CommentTests, CompressionTests, @@ -29,7 +30,8 @@ class BaseParser(CommentTests, 
CompressionTests, IndexColTests, MultithreadTests, NAvaluesTests, ParseDatesTests, ParserTests, SkipRowsTests, - UsecolsTests, QuotingTests): + UsecolsTests, QuotingTests, + DtypeTests): def read_csv(self, *args, **kwargs): raise NotImplementedError From 5462774229b0ace0651951b7a84f4e33e9b715ec Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 24 Sep 2016 14:09:32 -0500 Subject: [PATCH 12/23] remove unsupported test --- pandas/io/tests/parser/test_unsupported.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 5d60c20854a83..ffd1cfa9a2538 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -44,16 +44,6 @@ def test_c_engine(self): data = 'a b c\n1 2 3' msg = 'does not support' - # specify C-unsupported options with python-unsupported option - # (options will be ignored on fallback, raise) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep=None, - delim_whitespace=False, dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep=r'\s', dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), skipfooter=1, dtype={'a': float}) - # specify C engine with unsupported options (raise) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', From 64c7214b457157913ea938682d3e50900265045e Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 25 Sep 2016 09:10:13 -0500 Subject: [PATCH 13/23] add test/fix for dtype=object --- pandas/io/parsers.py | 15 ++++++++++----- pandas/io/tests/parser/dtypes.py | 15 +++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d4e1a70240bb2..b4e203f9d0236 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1304,7 +1304,7 @@ def _apply_converter(self, values, conv_f, na_values, 
col_na_values, cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, - try_numeric=False) + try_num_bool=False) return cvals, na_count def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, @@ -1331,10 +1331,15 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, values, conv_f, na_values, col_na_values, col_na_fvalues) else: + try_num_bool = True + if cast_type and is_object_dtype(cast_type): + # skip inference if specified dtype is object + try_num_bool = False + # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, - try_numeric=True) + try_num_bool) if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( @@ -1357,7 +1362,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, print('Filled %d NA values in column %s' % (na_count, str(c))) return result - def _infer_types(self, values, na_values, try_numeric=True): + def _infer_types(self, values, na_values, try_num_bool=True): na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = lib.ismember(values, na_values) @@ -1368,7 +1373,7 @@ def _infer_types(self, values, na_values, try_numeric=True): np.putmask(values, mask, np.nan) return values, na_count - if try_numeric: + if try_num_bool: try: result = lib.maybe_convert_numeric(values, na_values, False) na_count = isnull(result).sum() @@ -1381,7 +1386,7 @@ def _infer_types(self, values, na_values, try_numeric=True): if values.dtype == np.object_: na_count = lib.sanitize_objects(values, na_values, False) - if result.dtype == np.object_ and try_numeric: + if result.dtype == np.object_ and try_num_bool: result = lib.maybe_convert_bool(values, true_values=self.true_values, false_values=self.false_values) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index 4d796f00eec91..a0a3b43279475 100644 --- 
a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -5,17 +5,12 @@ for all of the parsers defined in parsers.py """ -from datetime import datetime - -import nose - import numpy as np import pandas as pd import pandas.util.testing as tm -from pandas.lib import Timestamp from pandas import DataFrame, Series, Index, MultiIndex, Categorical -from pandas.compat import parse_date, StringIO, lmap +from pandas.compat import StringIO from pandas.types.dtypes import CategoricalDtype @@ -30,8 +25,12 @@ def test_passing_dtype(self): # see gh-3795: passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) + expected = df.astype(str) + tm.assert_frame_equal(result, expected) + + # for parsing, interpret object as str + result = self.read_csv(path, dtype=object, index_col=0) + tm.assert_frame_equal(result, expected) # we expect all object columns, so need to # convert to test for equivalence From 26f42c2c43366da2fb9f6f7af4fb2b43fb2b081f Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 25 Sep 2016 10:24:55 -0500 Subject: [PATCH 14/23] float precision... 
--- pandas/io/tests/parser/c_parser_only.py | 3 +-- pandas/io/tests/parser/dtypes.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index d8926855ddca7..2f2a3ab507f8f 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -12,10 +12,9 @@ import pandas as pd import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex, Categorical +from pandas import DataFrame from pandas import compat from pandas.compat import StringIO, range, lrange -from pandas.types.dtypes import CategoricalDtype class CParserTests(object): diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index a0a3b43279475..cf37dd97b9fc9 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -21,11 +21,11 @@ def test_passing_dtype(self): 'AB'), index=['1A', '1B', '1C', '1D', '1E']) with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path) + df.to_csv(path, float_format='%.12f') # see gh-3795: passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - expected = df.astype(str) + expected = df.applymap(lambda x: '%.12f' % (x,)) tm.assert_frame_equal(result, expected) # for parsing, interpret object as str From 7fbe0a3dc53a3121fdad6b12e14718ca48b20af6 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 26 Sep 2016 19:29:15 -0500 Subject: [PATCH 15/23] float precision fix --- pandas/io/tests/parser/dtypes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index cf37dd97b9fc9..cba293965e56b 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -17,15 +17,15 @@ class DtypeTests(object): def test_passing_dtype(self): # see gh-6607 - df = DataFrame(np.random.rand(5, 2), columns=list( + df = 
DataFrame(np.random.rand(5, 2).round(4), columns=list( 'AB'), index=['1A', '1B', '1C', '1D', '1E']) with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path, float_format='%.12f') + df.to_csv(path) # see gh-3795: passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - expected = df.applymap(lambda x: '%.12f' % (x,)) + expected = df.astype(str) tm.assert_frame_equal(result, expected) # for parsing, interpret object as str From 08315b81883e8a46c20c6290eb56317c925ba7d8 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 30 Oct 2016 16:33:30 -0500 Subject: [PATCH 16/23] add docs; test for conv cast --- doc/source/io.rst | 9 ++-- doc/source/whatsnew/v0.20.0.txt | 9 ++++ pandas/io/parsers.py | 93 +++++++++++++++++++------------- pandas/io/tests/parser/dtypes.py | 10 ++++ 4 files changed, 79 insertions(+), 42 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ee319092c6dd5..03210ce3231b9 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None`` Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}`` (unsupported with ``engine='python'``). Use `str` or `object` to preserve and not interpret dtype. + + .. versionadded:: 0.20.0 support for the Python parser. + engine : {``'c'``, ``'python'``} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. @@ -473,10 +476,8 @@ However, if you wanted for all the data to be coerced, no matter the type, then using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be worth trying. -.. note:: - The ``dtype`` option is currently only supported by the C engine. - Specifying ``dtype`` with ``engine`` other than 'c' raises a - ``ValueError``. + .. versionadded:: 0.20.0 support for the Python parser. + The ``dtype`` option is supported by the 'python' engine .. 
note:: In some cases, reading in abnormal data with columns containing mixed dtypes diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 65b62601c7022..ce36d4cf53601 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -32,6 +32,15 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) +- The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns + is now supported with the ``'python'`` engine. See the :ref:`io docs ` for more information. + +.. ipython:: python + + from io import StringIO + data = "a,b\n1,2\n3,4" + pd.read_csv(StringIO(data), engine='python').dtypes + pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes .. _whatsnew_0200.api_breaking: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b4e203f9d0236..d7a300ef7095a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -116,8 +116,11 @@ dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} Use `str` or `object` to preserve and not interpret dtype. - If converters are specified, they will be applied AFTER - dtype conversion. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + + .. versionadded:: 0.20.0 support for the Python parser. + %s converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can either @@ -1293,20 +1296,6 @@ def _agg_index(self, index, try_parse_dates=True): return index - def _apply_converter(self, values, conv_f, na_values, col_na_values, - col_na_fvalues): - """ apply converter function to values, respecting NAs """ - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = lib.ismember(values, na_values).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, - try_num_bool=False) - return cvals, na_count - def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None): result = {} @@ -1324,45 +1313,58 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, else: col_na_values, col_na_fvalues = set(), set() - if conv_f is not None and cast_type is None: - # if type is not specified, apply the conversion first, without - # inference - cvals, na_count = self._apply_converter( - values, conv_f, na_values, - col_na_values, col_na_fvalues) + if conv_f is not None: + # conv_f applied to data before inference + # dtype isn't used if a converted specified + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = lib.ismember(values, na_values).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_num_bool=False) else: - try_num_bool = True - if cast_type and is_object_dtype(cast_type): - # skip inference if specified dtype is object - try_num_bool = False + # skip inference if specified dtype is object + try_num_bool = not (cast_type and is_object_dtype(cast_type)) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) + # type specificed in dtype param + if cast_type and not is_dtype_equal(cvals, cast_type): + cvals = self._cast_types(cvals, cast_type, 
c) + if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( cvals, _parser.na_values, self.use_unsigned) - if cast_type and not is_dtype_equal(cvals, cast_type): - # type specificed in dtype param - - cvals = self._cast_types(cvals, cast_type, c) - # for consistency with c-parser, if a converter and dtype are - # specified, apply the converter last - if conv_f is not None: - values, na_count = self._apply_converter( - values, conv_f, na_values, - col_na_values, col_na_fvalues) - result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) return result def _infer_types(self, values, na_values, try_num_bool=True): + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + try_num_bool : bool, default try + try to cast values to numeric (first preference) or boolean + + Returns: + -------- + converted : ndarray + na_count : int + """ + na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = lib.ismember(values, na_values) @@ -1394,7 +1396,22 @@ def _infer_types(self, values, na_values, try_num_bool=True): return result, na_count def _cast_types(self, values, cast_type, column): - """ cast column to type specified in dtypes= param """ + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray + cast_type : string or np.dtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray + """ + if is_categorical_dtype(cast_type): # XXX this is for consistency with # c-parser which parses all categories diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index cba293965e56b..510efac80ee78 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -214,3 +214,13 @@ def test_raise_on_passed_int_dtype_with_nas(self): self.assertRaises(ValueError, self.read_csv, 
StringIO(data), sep=",", skipinitialspace=True, dtype={'DOY': np.int64}) + + def test_dtype_with_converter(self): + data = """a,b +1.1,2.2 +1.2,2.3""" + result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, + converters={'a': lambda x: str(x)}) + # dtype spec ignored if converted specified + expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) From 810e750e2c19c4abd6ddd2253636e519cda6fee1 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 5 Nov 2016 12:22:48 -0500 Subject: [PATCH 17/23] Add warning if both converter and dtype specified --- pandas/io/parsers.py | 7 +++- pandas/io/tests/parser/dtypes.py | 6 ++-- pandas/parser.pyx | 58 +++++++++++++++++--------------- 3 files changed, 41 insertions(+), 30 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d7a300ef7095a..3f6a2e53343f3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1315,7 +1315,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, if conv_f is not None: # conv_f applied to data before inference - # dtype isn't used if a converted specified + if cast_type is not None: + warnings.warn(("Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used").format(c), ParserWarning, + stacklevel=7) + try: values = lib.map_infer(values, conv_f) except ValueError: diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index 510efac80ee78..a2163aaf31ea8 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -12,6 +12,7 @@ from pandas import DataFrame, Series, Index, MultiIndex, Categorical from pandas.compat import StringIO from pandas.types.dtypes import CategoricalDtype +from pandas.io.common import ParserWarning class DtypeTests(object): @@ -219,8 +220,9 @@ def test_dtype_with_converter(self): data = """a,b 1.1,2.2 1.2,2.3""" - result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, - 
converters={'a': lambda x: str(x)}) # dtype spec ignored if converted specified + with tm.assert_produces_warning(ParserWarning): + result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, + converters={'a': lambda x: str(x)}) expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 6b43dfbabc4a0..ca9b34c06f025 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -13,7 +13,7 @@ from cpython cimport (PyObject, PyBytes_FromString, PyUnicode_Check, PyUnicode_AsUTF8String, PyErr_Occurred, PyErr_Fetch) from cpython.ref cimport PyObject, Py_XDECREF -from io.common import ParserError, DtypeWarning, EmptyDataError +from io.common import ParserError, DtypeWarning, EmptyDataError, ParserWarning # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. @@ -987,7 +987,7 @@ cdef class TextReader: Py_ssize_t i, nused kh_str_t *na_hashset = NULL int start, end - object name, na_flist + object name, na_flist, col_dtype = None bint na_filter = 0 Py_ssize_t num_cols @@ -1043,14 +1043,33 @@ cdef class TextReader: else: na_filter = 0 + col_dtype = None + if self.dtype is not None: + if isinstance(self.dtype, dict): + if name in self.dtype: + col_dtype = self.dtype[name] + elif i in self.dtype: + col_dtype = self.dtype[i] + else: + if self.dtype.names: + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) + else: + col_dtype = self.dtype + if conv: + if col_dtype is not None: + warnings.warn(("Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used").format(name), ParserWarning, + stacklevel=5) results[i] = _apply_converter(conv, self.parser, i, start, end, self.c_encoding) continue # Should return as the desired dtype (inferred or specified) col_res, na_count = self._convert_tokens( - i, start, end, name, na_filter, na_hashset, 
na_flist) + i, start, end, name, na_filter, na_hashset, na_flist, col_dtype) if na_filter: self._free_na_set(na_hashset) @@ -1075,32 +1094,17 @@ cdef class TextReader: cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, object name, bint na_filter, kh_str_t *na_hashset, - object na_flist): - cdef: - object col_dtype = None - - if self.dtype is not None: - if isinstance(self.dtype, dict): - if name in self.dtype: - col_dtype = self.dtype[name] - elif i in self.dtype: - col_dtype = self.dtype[i] - else: - if self.dtype.names: - # structured array - col_dtype = np.dtype(self.dtype.descr[i][1]) - else: - col_dtype = self.dtype + object na_flist, object col_dtype): - if col_dtype is not None: - col_res, na_count = self._convert_with_dtype( - col_dtype, i, start, end, na_filter, - 1, na_hashset, na_flist) + if col_dtype is not None: + col_res, na_count = self._convert_with_dtype( + col_dtype, i, start, end, na_filter, + 1, na_hashset, na_flist) - # Fallback on the parse (e.g. we requested int dtype, - # but its actually a float). - if col_res is not None: - return col_res, na_count + # Fallback on the parse (e.g. we requested int dtype, + # but its actually a float). + if col_res is not None: + return col_res, na_count if i in self.noconvert: return self._string_convert(i, start, end, na_filter, na_hashset) From 10f5be3516ab8e40c726e989107f8ed0bc90f228 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 13 Nov 2016 09:18:15 -0600 Subject: [PATCH 18/23] doc comments --- doc/source/whatsnew/v0.20.0.txt | 1 - pandas/io/parsers.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ce36d4cf53601..d6470d9e8fb52 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -37,7 +37,6 @@ Other enhancements .. 
ipython:: python - from io import StringIO data = "a,b\n1,2\n3,4" pd.read_csv(StringIO(data), engine='python').dtypes pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3f6a2e53343f3..0736535ce2d67 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -118,9 +118,6 @@ Use `str` or `object` to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. - - .. versionadded:: 0.20.0 support for the Python parser. - %s converters : dict, default None Dict of functions for converting values in certain columns. Keys can either From b2f7b94457eaa603137e8ff8a6e77f5b4319637c Mon Sep 17 00:00:00 2001 From: Christopher Bartak Date: Wed, 23 Nov 2016 08:39:10 -0600 Subject: [PATCH 19/23] doc updates --- doc/source/io.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 03210ce3231b9..b1c151def26af 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -477,6 +477,7 @@ using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be worth trying. .. versionadded:: 0.20.0 support for the Python parser. + The ``dtype`` option is supported by the 'python' engine .. note:: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d6470d9e8fb52..5a51887d4f983 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -22,8 +22,17 @@ New features ~~~~~~~~~~~~ +``read_csv`` supports ``dtype`` keyword for python engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns + is now supported with the ``'python'`` engine. See the :ref:`io docs ` for more information. +.. 
ipython:: python + + data = "a,b\n1,2\n3,4" + pd.read_csv(StringIO(data), engine='python').dtypes + pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes .. _whatsnew_0200.enhancements.other: @@ -32,14 +41,6 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) -- The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns - is now supported with the ``'python'`` engine. See the :ref:`io docs ` for more information. - -.. ipython:: python - - data = "a,b\n1,2\n3,4" - pd.read_csv(StringIO(data), engine='python').dtypes - pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes .. _whatsnew_0200.api_breaking: From be2b43bf3bf998953a2a7dc3e30c285bcae92b70 Mon Sep 17 00:00:00 2001 From: Christopher Bartak Date: Wed, 23 Nov 2016 09:34:53 -0600 Subject: [PATCH 20/23] lint --- pandas/io/tests/parser/dtypes.py | 2 +- pandas/parser.pyx | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index a2163aaf31ea8..058bfea7ae330 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -223,6 +223,6 @@ def test_dtype_with_converter(self): # dtype spec ignored if converted specified with tm.assert_produces_warning(ParserWarning): result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, - converters={'a': lambda x: str(x)}) + converters={'a': lambda x: str(x)}) expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index ca9b34c06f025..6760e822960f1 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1069,7 +1069,8 @@ cdef class TextReader: # Should return as the desired dtype (inferred or specified) col_res, na_count = self._convert_tokens( - i, start, end, name, na_filter, na_hashset, na_flist, 
col_dtype) + i, start, end, name, na_filter, na_hashset, + na_flist, col_dtype) if na_filter: self._free_na_set(na_hashset) From 47669d3c8c379640a82c7f65341c80ecf540e743 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 24 Nov 2016 22:24:36 +0100 Subject: [PATCH 21/23] TST: move empty dtype tests from c_parser_only to dtype tests --- pandas/io/tests/parser/c_parser_only.py | 46 ------------------------- pandas/io/tests/parser/dtypes.py | 46 +++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 2f2a3ab507f8f..c781b0549ee60 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -371,49 +371,3 @@ def test_internal_null_byte(self): result = self.read_csv(StringIO(data), names=names) tm.assert_frame_equal(result, expected) - - def test_empty_dtype(self): - # see gh-14712 - data = 'a,b' - - expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) - result = self.read_csv(StringIO(data), header=0, dtype=np.float64) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'a': pd.Categorical([]), - 'b': pd.Categorical([])}, - index=[]) - result = self.read_csv(StringIO(data), header=0, - dtype='category') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') - result = self.read_csv(StringIO(data), header=0, - dtype='datetime64[ns]') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), - 'b': pd.Series([], dtype='timedelta64[ns]')}, - index=[]) - result = self.read_csv(StringIO(data), header=0, - dtype='timedelta64[ns]') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': np.float64}) - 
tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={0: np.float64}) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.int32) - expected['b'] = expected['b'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': np.int32, 1: np.float64}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index 058bfea7ae330..18c37b31f6480 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -226,3 +226,49 @@ def test_dtype_with_converter(self): converters={'a': lambda x: str(x)}) expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) tm.assert_frame_equal(result, expected) + + def test_empty_dtype(self): + # see gh-14712 + data = 'a,b' + + expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) + result = self.read_csv(StringIO(data), header=0, dtype=np.float64) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Categorical([]), + 'b': pd.Categorical([])}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='category') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') + result = self.read_csv(StringIO(data), header=0, + dtype='datetime64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), + 'b': pd.Series([], dtype='timedelta64[ns]')}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='timedelta64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': 
np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={0: np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.int32) + expected['b'] = expected['b'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.int32, 1: np.float64}) + tm.assert_frame_equal(result, expected) From 1706b39ad64ce75896bb680606fcd3aa4de3fffe Mon Sep 17 00:00:00 2001 From: Christopher Bartak Date: Fri, 25 Nov 2016 09:15:53 -0600 Subject: [PATCH 22/23] issue ref --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 5a51887d4f983..6e3559bee728d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -26,7 +26,7 @@ New features ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns - is now supported with the ``'python'`` engine. See the :ref:`io docs <io.dtypes>` for more information. + is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs <io.dtypes>` for more information. .. 
ipython:: python From 3abb0bd6e46e78557c1fd480ac173881dc5d530b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 25 Nov 2016 21:36:01 +0100 Subject: [PATCH 23/23] fix merge conflict leftover --- pandas/io/tests/parser/dtypes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index c0189050ee90b..18c37b31f6480 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -272,4 +272,3 @@ def test_empty_dtype(self): result = self.read_csv(StringIO(data), header=0, dtype={'a': np.int32, 1: np.float64}) tm.assert_frame_equal(result, expected) ->>>>>>> 47669d3c8c379640a82c7f65341c80ecf540e743